1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1982, 1986, 1988, 1990, 1993 30 * The Regents of the University of California. All rights reserved. 31 * 32 * Redistribution and use in source and binary forms, with or without 33 * modification, are permitted provided that the following conditions 34 * are met: 35 * 1. Redistributions of source code must retain the above copyright 36 * notice, this list of conditions and the following disclaimer. 37 * 2. Redistributions in binary form must reproduce the above copyright 38 * notice, this list of conditions and the following disclaimer in the 39 * documentation and/or other materials provided with the distribution. 40 * 3. All advertising materials mentioning features or use of this software 41 * must display the following acknowledgement: 42 * This product includes software developed by the University of 43 * California, Berkeley and its contributors. 44 * 4. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 * 60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 61 */ 62/* 63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce 64 * support for mandatory and extensible security protections. This notice 65 * is included in support of clause 2.2 (b) of the Apple Public License, 66 * Version 2.0. 67 */ 68 69#define _IP_VHL 70 71#include <sys/param.h> 72#include <sys/systm.h> 73#include <sys/kernel.h> 74#include <sys/malloc.h> 75#include <sys/mbuf.h> 76#include <sys/protosw.h> 77#include <sys/socket.h> 78#include <sys/socketvar.h> 79#include <kern/locks.h> 80#include <sys/sysctl.h> 81#include <sys/mcache.h> 82#include <sys/kdebug.h> 83 84#include <machine/endian.h> 85#include <pexpert/pexpert.h> 86#include <mach/sdt.h> 87 88#include <libkern/OSAtomic.h> 89#include <libkern/OSByteOrder.h> 90 91#include <net/if.h> 92#include <net/if_dl.h> 93#include <net/if_types.h> 94#include <net/route.h> 95#include <net/ntstat.h> 96#include <net/net_osdep.h> 97#include <net/dlil.h> 98 99#include <netinet/in.h> 100#include <netinet/in_systm.h> 101#include <netinet/ip.h> 102#include <netinet/in_pcb.h> 103#include <netinet/in_var.h> 104#include <netinet/ip_var.h> 105#include <netinet/kpi_ipfilter_var.h> 106 107#if CONFIG_MACF_NET 108#include <security/mac_framework.h> 109#endif /* CONFIG_MACF_NET */ 110 111#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1) 112#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3) 113#define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1) 114#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1) 115 116#if IPSEC 117#include <netinet6/ipsec.h> 118#include <netkey/key.h> 119#if IPSEC_DEBUG 120#include <netkey/key_debug.h> 121#else 122#define KEYDEBUG(lev, arg) 123#endif 124#endif /* IPSEC */ 125 126#if IPFIREWALL 127#include <netinet/ip_fw.h> 128#if IPDIVERT 129#include <netinet/ip_divert.h> 130#endif /* IPDIVERT */ 131#endif /* IPFIREWALL */ 132 133#if DUMMYNET 134#include <netinet/ip_dummynet.h> 135#endif 136 137#if PF 138#include <net/pfvar.h> 139#endif /* PF */ 140 141#if IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG 142#define print_ip(a) \ 143 printf("%ld.%ld.%ld.%ld", (ntohl(a.s_addr) >> 24) & 0xFF, \ 144 (ntohl(a.s_addr) >> 16) & 0xFF, \ 145 (ntohl(a.s_addr) >> 8) & 0xFF, \ 146 (ntohl(a.s_addr)) & 0xFF); 147#endif /* IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG */ 148 149u_short ip_id; 150 151static void ip_out_cksum_stats(int, u_int32_t); 152static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); 153static int ip_optcopy(struct ip *, struct ip *); 154static int ip_pcbopts(int, struct mbuf **, struct mbuf *); 155static void imo_trace(struct ip_moptions *, int); 156static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *, 157 struct sockaddr_in *, int); 158static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); 159 160extern struct ip_linklocal_stat ip_linklocal_stat; 161 162/* temporary: for testing */ 163#if IPSEC 164extern int ipsec_bypass; 165#endif 166 167static int ip_maxchainsent = 0; 168SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, 169 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, 170 "use dlil_output_list"); 171#if DEBUG 172static int forge_ce = 0; 173SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, 174 CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0, 175 "Forge ECN CE"); 176#endif /* DEBUG */ 177 178static int ip_select_srcif_debug = 0; 179SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, 180 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0, 181 "log source interface selection debug info"); 182 183#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ 184 185/* For gdb */ 186__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE; 187 188struct ip_moptions_dbg { 189 struct ip_moptions imo; /* ip_moptions */ 190 u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */ 191 u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */ 192 /* 193 * Alloc and free callers. 194 */ 195 ctrace_t imo_alloc; 196 ctrace_t imo_free; 197 /* 198 * Circular lists of IMO_ADDREF and IMO_REMREF callers. 199 */ 200 ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE]; 201 ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE]; 202}; 203 204#if DEBUG 205static unsigned int imo_debug = 1; /* debugging (enabled) */ 206#else 207static unsigned int imo_debug; /* debugging (disabled) */ 208#endif /* !DEBUG */ 209static unsigned int imo_size; /* size of zone element */ 210static struct zone *imo_zone; /* zone for ip_moptions */ 211 212#define IMO_ZONE_MAX 64 /* maximum elements in zone */ 213#define IMO_ZONE_NAME "ip_moptions" /* zone name */ 214 215/* 216 * IP output. The packet in mbuf chain m contains a skeletal IP 217 * header (with len, off, ttl, proto, tos, src, dst). 218 * The mbuf chain containing the packet will be freed. 219 * The mbuf opt, if present, will not be freed. 220 */ 221int 222ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags, 223 struct ip_moptions *imo, struct ip_out_args *ipoa) 224{ 225 return (ip_output_list(m0, 0, opt, ro, flags, imo, ipoa)); 226} 227 228/* 229 * IP output. The packet in mbuf chain m contains a skeletal IP 230 * header (with len, off, ttl, proto, tos, src, dst). 231 * The mbuf chain containing the packet will be freed. 232 * The mbuf opt, if present, will not be freed. 233 * 234 * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be 235 * skipped and ro->ro_rt would be used. Otherwise the result of route 236 * lookup is stored in ro->ro_rt. 237 * 238 * In the IP forwarding case, the packet will arrive with options already 239 * inserted, so must have a NULL opt pointer. 240 */ 241int 242ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt, 243 struct route *ro, int flags, struct ip_moptions *imo, 244 struct ip_out_args *ipoa) 245{ 246 struct ip *ip; 247 struct ifnet *ifp = NULL; /* not refcnt'd */ 248 struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt; 249 int hlen = sizeof (struct ip); 250 int len = 0, error = 0; 251 struct sockaddr_in *dst = NULL; 252 struct in_ifaddr *ia = NULL, *src_ia = NULL; 253 struct in_addr pkt_dst; 254 struct ipf_pktopts *ippo = NULL; 255 ipfilter_t inject_filter_ref = NULL; 256 struct mbuf *packetlist; 257 uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0; 258 unsigned int ifscope = IFSCOPE_NONE; 259 struct flowadv *adv = NULL; 260#if IPSEC 261 struct socket *so = NULL; 262 struct secpolicy *sp = NULL; 263#endif /* IPSEC */ 264#if IPFIREWALL 265 int ipfwoff; 266 struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; 267#endif /* IPFIREWALL */ 268#if IPFIREWALL || DUMMYNET 269 struct m_tag *tag; 270#endif /* IPFIREWALL || DUMMYNET */ 271#if DUMMYNET 272 struct ip_out_args saved_ipoa; 273 struct sockaddr_in dst_buf; 274#endif /* DUMMYNET */ 275 struct { 276#if IPSEC 277 struct ipsec_output_state ipsec_state; 278#endif /* IPSEC */ 279#if IPFIREWALL || DUMMYNET 280 struct ip_fw_args args; 281#endif /* IPFIREWALL || DUMMYNET */ 282#if IPFIREWALL_FORWARD 283 struct route sro_fwd; 284#endif /* IPFIREWALL_FORWARD */ 285#if DUMMYNET 286 struct route saved_route; 287#endif /* DUMMYNET */ 288 struct ipf_pktopts ipf_pktopts; 289 } ipobz; 290#define ipsec_state ipobz.ipsec_state 291#define args ipobz.args 292#define sro_fwd ipobz.sro_fwd 293#define saved_route ipobz.saved_route 294#define ipf_pktopts ipobz.ipf_pktopts 295 union { 296 struct { 297 boolean_t select_srcif : 1; /* set once */ 298 boolean_t srcbound : 1; /* set once */ 299 boolean_t nocell : 1; /* set once */ 300 boolean_t isbroadcast : 1; 301 boolean_t didfilter : 1; 302#if IPFIREWALL_FORWARD 303 boolean_t fwd_rewrite_src : 1; 304#endif /* IPFIREWALL_FORWARD */ 305 }; 306 uint32_t raw; 307 } ipobf = { .raw = 0 }; 308 309 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); 310 311 VERIFY(m0->m_flags & M_PKTHDR); 312 packetlist = m0; 313 314 /* zero out {ipsec_state, args, sro_fwd, saved_route, ipf_pktops} */ 315 bzero(&ipobz, sizeof (ipobz)); 316 ippo = &ipf_pktopts; 317 318#if IPFIREWALL || DUMMYNET 319 if (SLIST_EMPTY(&m0->m_pkthdr.tags)) 320 goto ipfw_tags_done; 321 322 /* Grab info from mtags prepended to the chain */ 323#if DUMMYNET 324 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, 325 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) { 326 struct dn_pkt_tag *dn_tag; 327 328 dn_tag = (struct dn_pkt_tag *)(tag+1); 329 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule; 330 args.fwa_pf_rule = dn_tag->dn_pf_rule; 331 opt = NULL; 332 saved_route = dn_tag->dn_ro; 333 ro = &saved_route; 334 335 imo = NULL; 336 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof (dst_buf)); 337 dst = &dst_buf; 338 ifp = dn_tag->dn_ifp; 339 flags = dn_tag->dn_flags; 340 if ((dn_tag->dn_flags & IP_OUTARGS)) { 341 saved_ipoa = dn_tag->dn_ipoa; 342 ipoa = &saved_ipoa; 343 } 344 345 m_tag_delete(m0, tag); 346 } 347#endif /* DUMMYNET */ 348 349#if IPDIVERT 350 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, 351 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) { 352 struct divert_tag *div_tag; 353 354 div_tag = (struct divert_tag *)(tag+1); 355 args.fwa_divert_rule = div_tag->cookie; 356 357 m_tag_delete(m0, tag); 358 } 359#endif /* IPDIVERT */ 360 361#if IPFIREWALL 362 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID, 363 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) { 364 struct ip_fwd_tag *ipfwd_tag; 365 366 ipfwd_tag = (struct ip_fwd_tag *)(tag+1); 367 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; 368 369 m_tag_delete(m0, tag); 370 } 371#endif /* IPFIREWALL */ 372 373ipfw_tags_done: 374#endif /* IPFIREWALL || DUMMYNET */ 375 376 m = m0; 377 m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP|PKTF_IFAINFO); 378 379#if IPSEC 380 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { 381 /* If packet is bound to an interface, check bound policies */ 382 if ((flags & IP_OUTARGS) && (ipoa != NULL) && 383 (ipoa->ipoa_flags & IPOAF_BOUND_IF) && 384 ipoa->ipoa_boundif != IFSCOPE_NONE) { 385 if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND, 386 &flags, ipoa, &sp) != 0) 387 goto bad; 388 } 389 } 390#endif /* IPSEC */ 391 392 VERIFY(ro != NULL); 393 394 if (ip_doscopedroute && (flags & IP_OUTARGS)) { 395 /* 396 * In the forwarding case, only the ifscope value is used, 397 * as source interface selection doesn't take place. 398 */ 399 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) && 400 (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) { 401 ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF; 402 } 403 404 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) && 405 ipoa->ipoa_boundif != IFSCOPE_NONE) { 406 ifscope = ipoa->ipoa_boundif; 407 ipf_pktopts.ippo_flags |= 408 (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE)); 409 } 410 411 /* double negation needed for bool bit field */ 412 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR); 413 if (ipobf.srcbound) 414 ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR; 415 } else { 416 ipobf.select_srcif = FALSE; 417 ipobf.srcbound = FALSE; 418 ifscope = IFSCOPE_NONE; 419 if (flags & IP_OUTARGS) { 420 ipoa->ipoa_boundif = IFSCOPE_NONE; 421 ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF | 422 IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR); 423 } 424 } 425 426 if ((flags & IP_OUTARGS) && (ipoa->ipoa_flags & IPOAF_NO_CELLULAR)) { 427 ipobf.nocell = TRUE; 428 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; 429 } 430 431 if (flags & IP_OUTARGS) { 432 adv = &ipoa->ipoa_flowadv; 433 adv->code = FADV_SUCCESS; 434 ipoa->ipoa_retflags = 0; 435 } 436 437#if DUMMYNET 438 if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) { 439 /* dummynet already saw us */ 440 ip = mtod(m, struct ip *); 441 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 442 pkt_dst = ip->ip_dst; 443 if (ro->ro_rt != NULL) { 444 RT_LOCK_SPIN(ro->ro_rt); 445 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; 446 if (ia) { 447 /* Become a regular mutex */ 448 RT_CONVERT_LOCK(ro->ro_rt); 449 IFA_ADDREF(&ia->ia_ifa); 450 } 451 RT_UNLOCK(ro->ro_rt); 452 } 453#if IPSEC 454 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { 455 so = ipsec_getsocket(m); 456 (void) ipsec_setsocket(m, NULL); 457 } 458#endif /* IPSEC */ 459#if IPFIREWALL 460 if (args.fwa_ipfw_rule != NULL) 461 goto skip_ipsec; 462#endif /* IPFIREWALL */ 463 if (args.fwa_pf_rule != NULL) 464 goto sendit; 465 } 466#endif /* DUMMYNET */ 467 468#if IPSEC 469 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) { 470 so = ipsec_getsocket(m); 471 (void) ipsec_setsocket(m, NULL); 472 } 473#endif /* IPSEC */ 474 475loopit: 476 ipobf.isbroadcast = FALSE; 477 ipobf.didfilter = FALSE; 478#if IPFIREWALL_FORWARD 479 ipobf.fwd_rewrite_src = FALSE; 480#endif /* IPFIREWALL_FORWARD */ 481 482 VERIFY(m->m_flags & M_PKTHDR); 483 /* 484 * No need to proccess packet twice if we've already seen it. 485 */ 486 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) 487 inject_filter_ref = ipf_get_inject_filter(m); 488 else 489 inject_filter_ref = NULL; 490 491 if (opt) { 492 m = ip_insertoptions(m, opt, &len); 493 hlen = len; 494 /* Update the chain */ 495 if (m != m0) { 496 if (m0 == packetlist) 497 packetlist = m; 498 m0 = m; 499 } 500 } 501 ip = mtod(m, struct ip *); 502 503#if IPFIREWALL 504 /* 505 * rdar://8542331 506 * 507 * When dealing with a packet chain, we need to reset "next_hop" 508 * because "dst" may have been changed to the gateway address below 509 * for the previous packet of the chain. This could cause the route 510 * to be inavertandly changed to the route to the gateway address 511 * (instead of the route to the destination). 512 */ 513 args.fwa_next_hop = next_hop_from_ipfwd_tag; 514 pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst; 515#else /* !IPFIREWALL */ 516 pkt_dst = ip->ip_dst; 517#endif /* !IPFIREWALL */ 518 519 /* 520 * We must not send if the packet is destined to network zero. 521 * RFC1122 3.2.1.3 (a) and (b). 522 */ 523 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) { 524 error = EHOSTUNREACH; 525 goto bad; 526 } 527 528 /* 529 * Fill in IP header. 530 */ 531 if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) { 532 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2); 533 ip->ip_off &= IP_DF; 534 ip->ip_id = ip_randomid(); 535 OSAddAtomic(1, &ipstat.ips_localout); 536 } else { 537 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 538 } 539 540#if DEBUG 541 /* For debugging, we let the stack forge congestion */ 542 if (forge_ce != 0 && 543 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 || 544 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) { 545 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE; 546 forge_ce--; 547 } 548#endif /* DEBUG */ 549 550 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr, 551 ip->ip_p, ip->ip_off, ip->ip_len); 552 553 dst = SIN(&ro->ro_dst); 554 555 /* 556 * If there is a cached route, 557 * check that it is to the same destination 558 * and is still up. If not, free it and try again. 559 * The address family should also be checked in case of sharing the 560 * cache with IPv6. 561 */ 562 563 if (ro->ro_rt != NULL) { 564 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY && 565 !(flags & (IP_ROUTETOIF | IP_FORWARDING))) { 566 src_ia = ifa_foraddr(ip->ip_src.s_addr); 567 if (src_ia == NULL) { 568 error = EADDRNOTAVAIL; 569 goto bad; 570 } 571 IFA_REMREF(&src_ia->ia_ifa); 572 src_ia = NULL; 573 } 574 /* 575 * Test rt_flags without holding rt_lock for performance 576 * reasons; if the route is down it will hopefully be 577 * caught by the layer below (since it uses this route 578 * as a hint) or during the next transmit. 579 */ 580 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET || 581 dst->sin_addr.s_addr != pkt_dst.s_addr) 582 ROUTE_RELEASE(ro); 583 584 /* 585 * If we're doing source interface selection, we may not 586 * want to use this route; only synch up the generation 587 * count otherwise. 588 */ 589 if (!ipobf.select_srcif && ro->ro_rt != NULL && 590 RT_GENID_OUTOFSYNC(ro->ro_rt)) 591 RT_GENID_SYNC(ro->ro_rt); 592 } 593 if (ro->ro_rt == NULL) { 594 bzero(dst, sizeof (*dst)); 595 dst->sin_family = AF_INET; 596 dst->sin_len = sizeof (*dst); 597 dst->sin_addr = pkt_dst; 598 } 599 /* 600 * If routing to interface only, 601 * short circuit routing lookup. 602 */ 603 if (flags & IP_ROUTETOIF) { 604 if (ia != NULL) 605 IFA_REMREF(&ia->ia_ifa); 606 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { 607 ia = ifatoia(ifa_ifwithnet(sintosa(dst))); 608 if (ia == NULL) { 609 OSAddAtomic(1, &ipstat.ips_noroute); 610 error = ENETUNREACH; 611 goto bad; 612 } 613 } 614 ifp = ia->ia_ifp; 615 ip->ip_ttl = 1; 616 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp); 617 /* 618 * For consistency with other cases below. Loopback 619 * multicast case is handled separately by ip_mloopback(). 620 */ 621 if ((ifp->if_flags & IFF_LOOPBACK) && 622 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) { 623 m->m_pkthdr.rcvif = ifp; 624 ip_setsrcifaddr_info(m, ifp->if_index, NULL); 625 ip_setdstifaddr_info(m, ifp->if_index, NULL); 626 } 627 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && 628 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { 629 /* 630 * Bypass the normal routing lookup for multicast 631 * packets if the interface is specified. 632 */ 633 ipobf.isbroadcast = FALSE; 634 if (ia != NULL) 635 IFA_REMREF(&ia->ia_ifa); 636 637 /* Macro takes reference on ia */ 638 IFP_TO_IA(ifp, ia); 639 } else { 640 struct ifaddr *ia0 = NULL; 641 boolean_t cloneok = FALSE; 642 /* 643 * Perform source interface selection; the source IP address 644 * must belong to one of the addresses of the interface used 645 * by the route. For performance reasons, do this only if 646 * there is no route, or if the routing table has changed, 647 * or if we haven't done source interface selection on this 648 * route (for this PCB instance) before. 649 */ 650 if (ipobf.select_srcif && 651 ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) || 652 !(ro->ro_flags & ROF_SRCIF_SELECTED))) { 653 /* Find the source interface */ 654 ia0 = in_selectsrcif(ip, ro, ifscope); 655 656 /* 657 * If the source address belongs to a cellular interface 658 * and the caller forbids our using interfaces of such 659 * type, pretend that there is no route. 660 */ 661 if (ipobf.nocell && ia0 != NULL && 662 IFNET_IS_CELLULAR(ia0->ifa_ifp)) { 663 IFA_REMREF(ia0); 664 ia0 = NULL; 665 error = EHOSTUNREACH; 666 if (flags & IP_OUTARGS) 667 ipoa->ipoa_retflags |= IPOARF_IFDENIED; 668 goto bad; 669 } 670 671 /* 672 * If the source address is spoofed (in the case of 673 * IP_RAWOUTPUT on an unbounded socket), or if this 674 * is destined for local/loopback, just let it go out 675 * using the interface of the route. Otherwise, 676 * there's no interface having such an address, 677 * so bail out. 678 */ 679 if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) || 680 ipobf.srcbound) && ifscope != lo_ifp->if_index) { 681 error = EADDRNOTAVAIL; 682 goto bad; 683 } 684 685 /* 686 * If the caller didn't explicitly specify the scope, 687 * pick it up from the source interface. If the cached 688 * route was wrong and was blown away as part of source 689 * interface selection, don't mask out RTF_PRCLONING 690 * since that route may have been allocated by the ULP, 691 * unless the IP header was created by the caller or 692 * the destination is IPv4 LLA. The check for the 693 * latter is needed because IPv4 LLAs are never scoped 694 * in the current implementation, and we don't want to 695 * replace the resolved IPv4 LLA route with one whose 696 * gateway points to that of the default gateway on 697 * the primary interface of the system. 698 */ 699 if (ia0 != NULL) { 700 if (ifscope == IFSCOPE_NONE) 701 ifscope = ia0->ifa_ifp->if_index; 702 cloneok = (!(flags & IP_RAWOUTPUT) && 703 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); 704 } 705 } 706 707 /* 708 * If this is the case, we probably don't want to allocate 709 * a protocol-cloned route since we didn't get one from the 710 * ULP. This lets TCP do its thing, while not burdening 711 * forwarding or ICMP with the overhead of cloning a route. 712 * Of course, we still want to do any cloning requested by 713 * the link layer, as this is probably required in all cases 714 * for correct operation (as it is for ARP). 715 */ 716 if (ro->ro_rt == NULL) { 717 unsigned long ign = RTF_PRCLONING; 718 /* 719 * We make an exception here: if the destination 720 * address is INADDR_BROADCAST, allocate a protocol- 721 * cloned host route so that we end up with a route 722 * marked with the RTF_BROADCAST flag. Otherwise, 723 * we would end up referring to the default route, 724 * instead of creating a cloned host route entry. 725 * That would introduce inconsistencies between ULPs 726 * that allocate a route and those that don't. The 727 * RTF_BROADCAST route is important since we'd want 728 * to send out undirected IP broadcast packets using 729 * link-level broadcast address. Another exception 730 * is for ULP-created routes that got blown away by 731 * source interface selection (see above). 732 * 733 * These exceptions will no longer be necessary when 734 * the RTF_PRCLONING scheme is no longer present. 735 */ 736 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) 737 ign &= ~RTF_PRCLONING; 738 739 /* 740 * Loosen the route lookup criteria if the ifscope 741 * corresponds to the loopback interface; this is 742 * needed to support Application Layer Gateways 743 * listening on loopback, in conjunction with packet 744 * filter redirection rules. The final source IP 745 * address will be rewritten by the packet filter 746 * prior to the RFC1122 loopback check below. 747 */ 748 if (ifscope == lo_ifp->if_index) 749 rtalloc_ign(ro, ign); 750 else 751 rtalloc_scoped_ign(ro, ign, ifscope); 752 753 /* 754 * If the route points to a cellular interface and the 755 * caller forbids our using interfaces of such type, 756 * pretend that there is no route. 757 */ 758 if (ipobf.nocell && ro->ro_rt != NULL) { 759 RT_LOCK_SPIN(ro->ro_rt); 760 if (IFNET_IS_CELLULAR(ro->ro_rt->rt_ifp)) { 761 RT_UNLOCK(ro->ro_rt); 762 ROUTE_RELEASE(ro); 763 if (flags & IP_OUTARGS) { 764 ipoa->ipoa_retflags |= 765 IPOARF_IFDENIED; 766 } 767 } else { 768 RT_UNLOCK(ro->ro_rt); 769 } 770 } 771 } 772 773 if (ro->ro_rt == NULL) { 774 OSAddAtomic(1, &ipstat.ips_noroute); 775 error = EHOSTUNREACH; 776 if (ia0 != NULL) { 777 IFA_REMREF(ia0); 778 ia0 = NULL; 779 } 780 goto bad; 781 } 782 783 if (ia != NULL) 784 IFA_REMREF(&ia->ia_ifa); 785 RT_LOCK_SPIN(ro->ro_rt); 786 ia = ifatoia(ro->ro_rt->rt_ifa); 787 if (ia != NULL) { 788 /* Become a regular mutex */ 789 RT_CONVERT_LOCK(ro->ro_rt); 790 IFA_ADDREF(&ia->ia_ifa); 791 } 792 /* 793 * Note: ia_ifp may not be the same as rt_ifp; the latter 794 * is what we use for determining outbound i/f, mtu, etc. 795 */ 796 ifp = ro->ro_rt->rt_ifp; 797 ro->ro_rt->rt_use++; 798 if (ro->ro_rt->rt_flags & RTF_GATEWAY) { 799 dst = SIN(ro->ro_rt->rt_gateway); 800 } 801 if (ro->ro_rt->rt_flags & RTF_HOST) { 802 /* double negation needed for bool bit field */ 803 ipobf.isbroadcast = 804 !!(ro->ro_rt->rt_flags & RTF_BROADCAST); 805 } else { 806 /* Become a regular mutex */ 807 RT_CONVERT_LOCK(ro->ro_rt); 808 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp); 809 } 810 /* 811 * For consistency with IPv6, as well as to ensure that 812 * IP_RECVIF is set correctly for packets that are sent 813 * to one of the local addresses. ia (rt_ifa) would have 814 * been fixed up by rt_setif for local routes. This 815 * would make it appear as if the packet arrives on the 816 * interface which owns the local address. Loopback 817 * multicast case is handled separately by ip_mloopback(). 818 */ 819 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) && 820 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) { 821 uint32_t srcidx; 822 823 m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp; 824 825 if (ia0 != NULL) 826 srcidx = ia0->ifa_ifp->if_index; 827 else if ((ro->ro_flags & ROF_SRCIF_SELECTED) && 828 ro->ro_srcia != NULL) 829 srcidx = ro->ro_srcia->ifa_ifp->if_index; 830 else 831 srcidx = 0; 832 833 ip_setsrcifaddr_info(m, srcidx, NULL); 834 ip_setdstifaddr_info(m, 0, ia); 835 } 836 RT_UNLOCK(ro->ro_rt); 837 if (ia0 != NULL) { 838 IFA_REMREF(ia0); 839 ia0 = NULL; 840 } 841 } 842 843 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { 844 struct ifnet *srcifp = NULL; 845 struct in_multi *inm; 846 u_int32_t vif; 847 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; 848 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; 849 850 m->m_flags |= M_MCAST; 851 /* 852 * IP destination address is multicast. Make sure "dst" 853 * still points to the address in "ro". (It may have been 854 * changed to point to a gateway address, above.) 855 */ 856 dst = SIN(&ro->ro_dst); 857 /* 858 * See if the caller provided any multicast options 859 */ 860 if (imo != NULL) { 861 IMO_LOCK(imo); 862 vif = imo->imo_multicast_vif; 863 ttl = imo->imo_multicast_ttl; 864 loop = imo->imo_multicast_loop; 865 if (!(flags & IP_RAWOUTPUT)) 866 ip->ip_ttl = ttl; 867 if (imo->imo_multicast_ifp != NULL) 868 ifp = imo->imo_multicast_ifp; 869 IMO_UNLOCK(imo); 870#if MROUTING 871 if (vif != -1 && (!(flags & IP_RAWOUTPUT) || 872 ip->ip_src.s_addr == INADDR_ANY)) 873 ip->ip_src.s_addr = ip_mcast_src(vif); 874#endif /* MROUTING */ 875 } else if (!(flags & IP_RAWOUTPUT)) { 876 vif = -1; 877 ip->ip_ttl = ttl; 878 } 879 /* 880 * Confirm that the outgoing interface supports multicast. 881 */ 882 if (imo == NULL || vif == -1) { 883 if (!(ifp->if_flags & IFF_MULTICAST)) { 884 OSAddAtomic(1, &ipstat.ips_noroute); 885 error = ENETUNREACH; 886 goto bad; 887 } 888 } 889 /* 890 * If source address not specified yet, use address 891 * of outgoing interface. 892 */ 893 if (ip->ip_src.s_addr == INADDR_ANY) { 894 struct in_ifaddr *ia1; 895 lck_rw_lock_shared(in_ifaddr_rwlock); 896 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) { 897 IFA_LOCK_SPIN(&ia1->ia_ifa); 898 if (ia1->ia_ifp == ifp) { 899 ip->ip_src = IA_SIN(ia1)->sin_addr; 900 srcifp = ifp; 901 IFA_UNLOCK(&ia1->ia_ifa); 902 break; 903 } 904 IFA_UNLOCK(&ia1->ia_ifa); 905 } 906 lck_rw_done(in_ifaddr_rwlock); 907 if (ip->ip_src.s_addr == INADDR_ANY) { 908 error = ENETUNREACH; 909 goto bad; 910 } 911 } 912 913 in_multihead_lock_shared(); 914 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm); 915 in_multihead_lock_done(); 916 if (inm != NULL && (imo == NULL || loop)) { 917 /* 918 * If we belong to the destination multicast group 919 * on the outgoing interface, and the caller did not 920 * forbid loopback, loop back a copy. 921 */ 922 if (!TAILQ_EMPTY(&ipv4_filters)) { 923 struct ipfilter *filter; 924 int seen = (inject_filter_ref == NULL); 925 926 if (imo != NULL) { 927 ipf_pktopts.ippo_flags |= 928 IPPOF_MCAST_OPTS; 929 ipf_pktopts.ippo_mcast_ifnet = ifp; 930 ipf_pktopts.ippo_mcast_ttl = ttl; 931 ipf_pktopts.ippo_mcast_loop = loop; 932 } 933 934 ipf_ref(); 935 936 /* 937 * 4135317 - always pass network byte 938 * order to filter 939 */ 940#if BYTE_ORDER != BIG_ENDIAN 941 HTONS(ip->ip_len); 942 HTONS(ip->ip_off); 943#endif 944 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { 945 if (seen == 0) { 946 if ((struct ipfilter *) 947 inject_filter_ref == filter) 948 seen = 1; 949 } else if (filter->ipf_filter. 950 ipf_output != NULL) { 951 errno_t result; 952 result = filter->ipf_filter. 953 ipf_output(filter-> 954 ipf_filter.cookie, 955 (mbuf_t *)&m, ippo); 956 if (result == EJUSTRETURN) { 957 ipf_unref(); 958 INM_REMREF(inm); 959 goto done; 960 } 961 if (result != 0) { 962 ipf_unref(); 963 INM_REMREF(inm); 964 goto bad; 965 } 966 } 967 } 968 969 /* set back to host byte order */ 970 ip = mtod(m, struct ip *); 971#if BYTE_ORDER != BIG_ENDIAN 972 NTOHS(ip->ip_len); 973 NTOHS(ip->ip_off); 974#endif 975 ipf_unref(); 976 ipobf.didfilter = TRUE; 977 } 978 ip_mloopback(srcifp, ifp, m, dst, hlen); 979 } 980#if MROUTING 981 else { 982 /* 983 * If we are acting as a multicast router, perform 984 * multicast forwarding as if the packet had just 985 * arrived on the interface to which we are about 986 * to send. The multicast forwarding function 987 * recursively calls this function, using the 988 * IP_FORWARDING flag to prevent infinite recursion. 989 * 990 * Multicasts that are looped back by ip_mloopback(), 991 * above, will be forwarded by the ip_input() routine, 992 * if necessary. 993 */ 994 if (ip_mrouter && !(flags & IP_FORWARDING)) { 995 /* 996 * Check if rsvp daemon is running. If not, 997 * don't set ip_moptions. This ensures that 998 * the packet is multicast and not just sent 999 * down one link as prescribed by rsvpd. 1000 */ 1001 if (!rsvp_on) 1002 imo = NULL; 1003 if (ip_mforward(ip, ifp, m, imo) != 0) { 1004 m_freem(m); 1005 if (inm != NULL) 1006 INM_REMREF(inm); 1007 OSAddAtomic(1, &ipstat.ips_cantforward); 1008 goto done; 1009 } 1010 } 1011 } 1012#endif /* MROUTING */ 1013 if (inm != NULL) 1014 INM_REMREF(inm); 1015 /* 1016 * Multicasts with a time-to-live of zero may be looped- 1017 * back, above, but must not be transmitted on a network. 1018 * Also, multicasts addressed to the loopback interface 1019 * are not sent -- the above call to ip_mloopback() will 1020 * loop back a copy if this host actually belongs to the 1021 * destination group on the loopback interface. 1022 */ 1023 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { 1024 m_freem(m); 1025 goto done; 1026 } 1027 1028 goto sendit; 1029 } 1030 /* 1031 * If source address not specified yet, use address 1032 * of outgoing interface. 1033 */ 1034 if (ip->ip_src.s_addr == INADDR_ANY) { 1035 IFA_LOCK_SPIN(&ia->ia_ifa); 1036 ip->ip_src = IA_SIN(ia)->sin_addr; 1037 IFA_UNLOCK(&ia->ia_ifa); 1038#if IPFIREWALL_FORWARD 1039 /* 1040 * Keep note that we did this - if the firewall changes 1041 * the next-hop, our interface may change, changing the 1042 * default source IP. It's a shame so much effort happens 1043 * twice. Oh well. 1044 */ 1045 ipobf.fwd_rewrite_src = TRUE; 1046#endif /* IPFIREWALL_FORWARD */ 1047 } 1048 1049 /* 1050 * Look for broadcast address and 1051 * and verify user is allowed to send 1052 * such a packet. 1053 */ 1054 if (ipobf.isbroadcast) { 1055 if (!(ifp->if_flags & IFF_BROADCAST)) { 1056 error = EADDRNOTAVAIL; 1057 goto bad; 1058 } 1059 if (!(flags & IP_ALLOWBROADCAST)) { 1060 error = EACCES; 1061 goto bad; 1062 } 1063 /* don't allow broadcast messages to be fragmented */ 1064 if ((u_short)ip->ip_len > ifp->if_mtu) { 1065 error = EMSGSIZE; 1066 goto bad; 1067 } 1068 m->m_flags |= M_BCAST; 1069 } else { 1070 m->m_flags &= ~M_BCAST; 1071 } 1072 1073sendit: 1074#if PF 1075 /* Invoke outbound packet filter */ 1076 if (PF_IS_ENABLED) { 1077 int rc; 1078 1079 m0 = m; /* Save for later */ 1080#if DUMMYNET 1081 args.fwa_m = m; 1082 args.fwa_next_hop = dst; 1083 args.fwa_oif = ifp; 1084 args.fwa_ro = ro; 1085 args.fwa_dst = dst; 1086 args.fwa_oflags = flags; 1087 if (flags & IP_OUTARGS) 1088 args.fwa_ipoa = ipoa; 1089 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args); 1090#else /* DUMMYNET */ 1091 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL); 1092#endif /* DUMMYNET */ 1093 if (rc != 0 || m == NULL) { 1094 /* Move to the next packet */ 1095 m = *mppn; 1096 1097 /* Skip ahead if first packet in list got dropped */ 1098 if (packetlist == m0) 1099 packetlist = m; 1100 1101 if (m != NULL) { 1102 m0 = m; 1103 /* Next packet in the chain */ 1104 goto loopit; 1105 } else if (packetlist != NULL) { 1106 /* No more packet; send down the chain */ 1107 goto sendchain; 1108 } 1109 /* Nothing left; we're done */ 1110 goto done; 1111 } 1112 m0 = m; 1113 ip = mtod(m, struct ip *); 1114 pkt_dst = ip->ip_dst; 1115 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 1116 } 1117#endif /* PF */ 1118 /* 1119 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt 1120 */ 1121 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || 1122 IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { 1123 ip_linklocal_stat.iplls_out_total++; 1124 if (ip->ip_ttl != MAXTTL) { 1125 ip_linklocal_stat.iplls_out_badttl++; 1126 ip->ip_ttl = MAXTTL; 1127 } 1128 } 1129 1130 if (!ipobf.didfilter && !TAILQ_EMPTY(&ipv4_filters)) { 1131 struct ipfilter *filter; 1132 int seen = (inject_filter_ref == NULL); 1133 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; 1134 1135 /* 1136 * Check that a TSO frame isn't passed to a filter. 1137 * This could happen if a filter is inserted while 1138 * TCP is sending the TSO packet. 1139 */ 1140 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { 1141 error = EMSGSIZE; 1142 goto bad; 1143 } 1144 1145 ipf_ref(); 1146 1147 /* 4135317 - always pass network byte order to filter */ 1148#if BYTE_ORDER != BIG_ENDIAN 1149 HTONS(ip->ip_len); 1150 HTONS(ip->ip_off); 1151#endif 1152 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { 1153 if (seen == 0) { 1154 if ((struct ipfilter *)inject_filter_ref == 1155 filter) 1156 seen = 1; 1157 } else if (filter->ipf_filter.ipf_output) { 1158 errno_t result; 1159 result = filter->ipf_filter. 1160 ipf_output(filter->ipf_filter.cookie, 1161 (mbuf_t *)&m, ippo); 1162 if (result == EJUSTRETURN) { 1163 ipf_unref(); 1164 goto done; 1165 } 1166 if (result != 0) { 1167 ipf_unref(); 1168 goto bad; 1169 } 1170 } 1171 } 1172 /* set back to host byte order */ 1173 ip = mtod(m, struct ip *); 1174#if BYTE_ORDER != BIG_ENDIAN 1175 NTOHS(ip->ip_len); 1176 NTOHS(ip->ip_off); 1177#endif 1178 ipf_unref(); 1179 } 1180 1181#if IPSEC 1182 /* temporary for testing only: bypass ipsec alltogether */ 1183 1184 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) 1185 goto skip_ipsec; 1186 1187 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); 1188 1189 /* May have been set above if packet was bound */ 1190 if (sp == NULL) { 1191 /* get SP for this packet */ 1192 if (so == NULL) 1193 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, 1194 flags, &error); 1195 else 1196 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, 1197 so, &error); 1198 1199 if (sp == NULL) { 1200 IPSEC_STAT_INCREMENT(ipsecstat.out_inval); 1201 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1202 0, 0, 0, 0, 0); 1203 goto bad; 1204 } 1205 } 1206 1207 error = 0; 1208 1209 /* check policy */ 1210 switch (sp->policy) { 1211 case IPSEC_POLICY_DISCARD: 1212 case IPSEC_POLICY_GENERATE: 1213 /* 1214 * This packet is just discarded. 1215 */ 1216 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio); 1217 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1218 1, 0, 0, 0, 0); 1219 goto bad; 1220 1221 case IPSEC_POLICY_BYPASS: 1222 case IPSEC_POLICY_NONE: 1223 /* no need to do IPsec. */ 1224 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1225 2, 0, 0, 0, 0); 1226 goto skip_ipsec; 1227 1228 case IPSEC_POLICY_IPSEC: 1229 if (sp->req == NULL) { 1230 /* acquire a policy */ 1231 error = key_spdacquire(sp); 1232 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1233 3, 0, 0, 0, 0); 1234 goto bad; 1235 } 1236 if (sp->ipsec_if) { 1237 /* Verify the redirect to ipsec interface */ 1238 if (sp->ipsec_if == ifp) { 1239 /* Set policy for mbuf */ 1240 m->m_pkthdr.ipsec_policy = sp->id; 1241 goto skip_ipsec; 1242 } 1243 goto bad; 1244 } 1245 break; 1246 1247 case IPSEC_POLICY_ENTRUST: 1248 default: 1249 printf("ip_output: Invalid policy found. %d\n", sp->policy); 1250 } 1251 { 1252 ipsec_state.m = m; 1253 if (flags & IP_ROUTETOIF) { 1254 bzero(&ipsec_state.ro, sizeof (ipsec_state.ro)); 1255 } else { 1256 route_copyout(&ipsec_state.ro, ro, sizeof (ipsec_state.ro)); 1257 } 1258 ipsec_state.dst = SA(dst); 1259 1260 ip->ip_sum = 0; 1261 1262 /* 1263 * XXX 1264 * delayed checksums are not currently compatible with IPsec 1265 */ 1266 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) 1267 in_delayed_cksum(m); 1268 1269#if BYTE_ORDER != BIG_ENDIAN 1270 HTONS(ip->ip_len); 1271 HTONS(ip->ip_off); 1272#endif 1273 1274 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, 1275 struct ip *, ip, struct ifnet *, ifp, 1276 struct ip *, ip, struct ip6_hdr *, NULL); 1277 1278 error = ipsec4_output(&ipsec_state, sp, flags); 1279 1280 m0 = m = ipsec_state.m; 1281 1282#if DUMMYNET 1283 /* 1284 * If we're about to use the route in ipsec_state 1285 * and this came from dummynet, cleaup now. 1286 */ 1287 if (ro == &saved_route && 1288 (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) 1289 ROUTE_RELEASE(ro); 1290#endif /* DUMMYNET */ 1291 1292 if (flags & IP_ROUTETOIF) { 1293 /* 1294 * if we have tunnel mode SA, we may need to ignore 1295 * IP_ROUTETOIF. 1296 */ 1297 if (ipsec_state.tunneled) { 1298 flags &= ~IP_ROUTETOIF; 1299 ro = &ipsec_state.ro; 1300 } 1301 } else { 1302 ro = &ipsec_state.ro; 1303 } 1304 dst = SIN(ipsec_state.dst); 1305 if (error) { 1306 /* mbuf is already reclaimed in ipsec4_output. */ 1307 m0 = NULL; 1308 switch (error) { 1309 case EHOSTUNREACH: 1310 case ENETUNREACH: 1311 case EMSGSIZE: 1312 case ENOBUFS: 1313 case ENOMEM: 1314 break; 1315 default: 1316 printf("ip4_output (ipsec): error code %d\n", error); 1317 /* FALLTHRU */ 1318 case ENOENT: 1319 /* don't show these error codes to the user */ 1320 error = 0; 1321 break; 1322 } 1323 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1324 4, 0, 0, 0, 0); 1325 goto bad; 1326 } 1327 } 1328 1329 /* be sure to update variables that are affected by ipsec4_output() */ 1330 ip = mtod(m, struct ip *); 1331 1332#ifdef _IP_VHL 1333 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 1334#else /* !_IP_VHL */ 1335 hlen = ip->ip_hl << 2; 1336#endif /* !_IP_VHL */ 1337 /* Check that there wasn't a route change and src is still valid */ 1338 if (ROUTE_UNUSABLE(ro)) { 1339 ROUTE_RELEASE(ro); 1340 VERIFY(src_ia == NULL); 1341 if (ip->ip_src.s_addr != INADDR_ANY && 1342 !(flags & (IP_ROUTETOIF | IP_FORWARDING)) && 1343 (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) { 1344 error = EADDRNOTAVAIL; 1345 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1346 5, 0, 0, 0, 0); 1347 goto bad; 1348 } 1349 if (src_ia != NULL) { 1350 IFA_REMREF(&src_ia->ia_ifa); 1351 src_ia = NULL; 1352 } 1353 } 1354 1355 if (ro->ro_rt == NULL) { 1356 if (!(flags & IP_ROUTETOIF)) { 1357 printf("%s: can't update route after " 1358 "IPsec processing\n", __func__); 1359 error = EHOSTUNREACH; /* XXX */ 1360 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1361 6, 0, 0, 0, 0); 1362 goto bad; 1363 } 1364 } else { 1365 if (ia != NULL) 1366 IFA_REMREF(&ia->ia_ifa); 1367 RT_LOCK_SPIN(ro->ro_rt); 1368 ia = ifatoia(ro->ro_rt->rt_ifa); 1369 if (ia != NULL) { 1370 /* Become a regular mutex */ 1371 RT_CONVERT_LOCK(ro->ro_rt); 1372 IFA_ADDREF(&ia->ia_ifa); 1373 } 1374 ifp = ro->ro_rt->rt_ifp; 1375 RT_UNLOCK(ro->ro_rt); 1376 } 1377 1378 /* make it flipped, again. */ 1379#if BYTE_ORDER != BIG_ENDIAN 1380 NTOHS(ip->ip_len); 1381 NTOHS(ip->ip_off); 1382#endif 1383 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1384 7, 0xff, 0xff, 0xff, 0xff); 1385 1386 /* Pass to filters again */ 1387 if (!TAILQ_EMPTY(&ipv4_filters)) { 1388 struct ipfilter *filter; 1389 1390 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; 1391 1392 /* 1393 * Check that a TSO frame isn't passed to a filter. 1394 * This could happen if a filter is inserted while 1395 * TCP is sending the TSO packet. 1396 */ 1397 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) { 1398 error = EMSGSIZE; 1399 goto bad; 1400 } 1401 1402 ipf_ref(); 1403 1404 /* 4135317 - always pass network byte order to filter */ 1405#if BYTE_ORDER != BIG_ENDIAN 1406 HTONS(ip->ip_len); 1407 HTONS(ip->ip_off); 1408#endif 1409 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { 1410 if (filter->ipf_filter.ipf_output) { 1411 errno_t result; 1412 result = filter->ipf_filter. 1413 ipf_output(filter->ipf_filter.cookie, 1414 (mbuf_t *)&m, ippo); 1415 if (result == EJUSTRETURN) { 1416 ipf_unref(); 1417 goto done; 1418 } 1419 if (result != 0) { 1420 ipf_unref(); 1421 goto bad; 1422 } 1423 } 1424 } 1425 /* set back to host byte order */ 1426 ip = mtod(m, struct ip *); 1427#if BYTE_ORDER != BIG_ENDIAN 1428 NTOHS(ip->ip_len); 1429 NTOHS(ip->ip_off); 1430#endif 1431 ipf_unref(); 1432 } 1433skip_ipsec: 1434#endif /* IPSEC */ 1435 1436#if IPFIREWALL 1437 /* 1438 * Check with the firewall... 1439 * but not if we are already being fwd'd from a firewall. 1440 */ 1441 if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) { 1442 struct sockaddr_in *old = dst; 1443 1444 args.fwa_m = m; 1445 args.fwa_next_hop = dst; 1446 args.fwa_oif = ifp; 1447 ipfwoff = ip_fw_chk_ptr(&args); 1448 m = args.fwa_m; 1449 dst = args.fwa_next_hop; 1450 1451 /* 1452 * On return we must do the following: 1453 * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new) 1454 * 1<=off<= 0xffff -> DIVERT 1455 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe 1456 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet 1457 * dst != old -> IPFIREWALL_FORWARD 1458 * off==0, dst==old -> accept 1459 * If some of the above modules is not compiled in, then 1460 * we should't have to check the corresponding condition 1461 * (because the ipfw control socket should not accept 1462 * unsupported rules), but better play safe and drop 1463 * packets in case of doubt. 1464 */ 1465 m0 = m; 1466 if ((ipfwoff & IP_FW_PORT_DENY_FLAG) || m == NULL) { 1467 if (m) 1468 m_freem(m); 1469 error = EACCES; 1470 goto done; 1471 } 1472 ip = mtod(m, struct ip *); 1473 1474 if (ipfwoff == 0 && dst == old) { /* common case */ 1475 goto pass; 1476 } 1477#if DUMMYNET 1478 if (DUMMYNET_LOADED && (ipfwoff & IP_FW_PORT_DYNT_FLAG) != 0) { 1479 /* 1480 * pass the pkt to dummynet. Need to include 1481 * pipe number, m, ifp, ro, dst because these are 1482 * not recomputed in the next pass. 1483 * All other parameters have been already used and 1484 * so they are not needed anymore. 1485 * XXX note: if the ifp or ro entry are deleted 1486 * while a pkt is in dummynet, we are in trouble! 1487 */ 1488 args.fwa_ro = ro; 1489 args.fwa_dst = dst; 1490 args.fwa_oflags = flags; 1491 if (flags & IP_OUTARGS) 1492 args.fwa_ipoa = ipoa; 1493 1494 error = ip_dn_io_ptr(m, ipfwoff & 0xffff, DN_TO_IP_OUT, 1495 &args, DN_CLIENT_IPFW); 1496 goto done; 1497 } 1498#endif /* DUMMYNET */ 1499#if IPDIVERT 1500 if (ipfwoff != 0 && (ipfwoff & IP_FW_PORT_DYNT_FLAG) == 0) { 1501 struct mbuf *clone = NULL; 1502 1503 /* Clone packet if we're doing a 'tee' */ 1504 if ((ipfwoff & IP_FW_PORT_TEE_FLAG) != 0) 1505 clone = m_dup(m, M_DONTWAIT); 1506 /* 1507 * XXX 1508 * delayed checksums are not currently compatible 1509 * with divert sockets. 1510 */ 1511 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) 1512 in_delayed_cksum(m); 1513 1514 /* Restore packet header fields to original values */ 1515 1516#if BYTE_ORDER != BIG_ENDIAN 1517 HTONS(ip->ip_len); 1518 HTONS(ip->ip_off); 1519#endif 1520 1521 /* Deliver packet to divert input routine */ 1522 divert_packet(m, 0, ipfwoff & 0xffff, 1523 args.fwa_divert_rule); 1524 1525 /* If 'tee', continue with original packet */ 1526 if (clone != NULL) { 1527 m0 = m = clone; 1528 ip = mtod(m, struct ip *); 1529 goto pass; 1530 } 1531 goto done; 1532 } 1533#endif /* IPDIVERT */ 1534#if IPFIREWALL_FORWARD 1535 /* 1536 * Here we check dst to make sure it's directly reachable on 1537 * the interface we previously thought it was. 1538 * If it isn't (which may be likely in some situations) we have 1539 * to re-route it (ie, find a route for the next-hop and the 1540 * associated interface) and set them here. This is nested 1541 * forwarding which in most cases is undesirable, except where 1542 * such control is nigh impossible. So we do it here. 1543 * And I'm babbling. 1544 */ 1545 if (ipfwoff == 0 && old != dst) { 1546 struct in_ifaddr *ia_fw; 1547 struct route *ro_fwd = &sro_fwd; 1548 1549#if IPFIREWALL_FORWARD_DEBUG 1550 printf("IPFIREWALL_FORWARD: New dst ip: "); 1551 print_ip(dst->sin_addr); 1552 printf("\n"); 1553#endif /* IPFIREWALL_FORWARD_DEBUG */ 1554 /* 1555 * We need to figure out if we have been forwarded 1556 * to a local socket. If so then we should somehow 1557 * "loop back" to ip_input, and get directed to the 1558 * PCB as if we had received this packet. This is 1559 * because it may be dificult to identify the packets 1560 * you want to forward until they are being output 1561 * and have selected an interface. (e.g. locally 1562 * initiated packets) If we used the loopback inteface, 1563 * we would not be able to control what happens 1564 * as the packet runs through ip_input() as 1565 * it is done through a ISR. 1566 */ 1567 lck_rw_lock_shared(in_ifaddr_rwlock); 1568 TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) { 1569 /* 1570 * If the addr to forward to is one 1571 * of ours, we pretend to 1572 * be the destination for this packet. 1573 */ 1574 IFA_LOCK_SPIN(&ia_fw->ia_ifa); 1575 if (IA_SIN(ia_fw)->sin_addr.s_addr == 1576 dst->sin_addr.s_addr) { 1577 IFA_UNLOCK(&ia_fw->ia_ifa); 1578 break; 1579 } 1580 IFA_UNLOCK(&ia_fw->ia_ifa); 1581 } 1582 lck_rw_done(in_ifaddr_rwlock); 1583 if (ia_fw) { 1584 /* tell ip_input "dont filter" */ 1585 struct m_tag *fwd_tag; 1586 struct ip_fwd_tag *ipfwd_tag; 1587 1588 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, 1589 KERNEL_TAG_TYPE_IPFORWARD, 1590 sizeof (*ipfwd_tag), M_NOWAIT, m); 1591 if (fwd_tag == NULL) { 1592 error = ENOBUFS; 1593 goto bad; 1594 } 1595 1596 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); 1597 ipfwd_tag->next_hop = args.fwa_next_hop; 1598 1599 m_tag_prepend(m, fwd_tag); 1600 1601 if (m->m_pkthdr.rcvif == NULL) 1602 m->m_pkthdr.rcvif = lo_ifp; 1603 1604#if BYTE_ORDER != BIG_ENDIAN 1605 HTONS(ip->ip_len); 1606 HTONS(ip->ip_off); 1607#endif 1608 mbuf_outbound_finalize(m, PF_INET, 0); 1609 1610 /* 1611 * we need to call dlil_output to run filters 1612 * and resync to avoid recursion loops. 1613 */ 1614 if (lo_ifp) { 1615 dlil_output(lo_ifp, PF_INET, m, NULL, 1616 SA(dst), 0, adv); 1617 } else { 1618 printf("%s: no loopback ifp for " 1619 "forwarding!!!\n", __func__); 1620 } 1621 goto done; 1622 } 1623 /* 1624 * Some of the logic for this was nicked from above. 1625 * 1626 * This rewrites the cached route in a local PCB. 1627 * Is this what we want to do? 1628 */ 1629 ROUTE_RELEASE(ro_fwd); 1630 bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst)); 1631 1632 rtalloc_ign(ro_fwd, RTF_PRCLONING); 1633 1634 if (ro_fwd->ro_rt == NULL) { 1635 OSAddAtomic(1, &ipstat.ips_noroute); 1636 error = EHOSTUNREACH; 1637 goto bad; 1638 } 1639 1640 RT_LOCK_SPIN(ro_fwd->ro_rt); 1641 ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); 1642 if (ia_fw != NULL) { 1643 /* Become a regular mutex */ 1644 RT_CONVERT_LOCK(ro_fwd->ro_rt); 1645 IFA_ADDREF(&ia_fw->ia_ifa); 1646 } 1647 ifp = ro_fwd->ro_rt->rt_ifp; 1648 ro_fwd->ro_rt->rt_use++; 1649 if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) 1650 dst = SIN(ro_fwd->ro_rt->rt_gateway); 1651 if (ro_fwd->ro_rt->rt_flags & RTF_HOST) { 1652 /* double negation needed for bool bit field */ 1653 ipobf.isbroadcast = 1654 !!(ro_fwd->ro_rt->rt_flags & RTF_BROADCAST); 1655 } else { 1656 /* Become a regular mutex */ 1657 RT_CONVERT_LOCK(ro_fwd->ro_rt); 1658 ipobf.isbroadcast = 1659 in_broadcast(dst->sin_addr, ifp); 1660 } 1661 RT_UNLOCK(ro_fwd->ro_rt); 1662 ROUTE_RELEASE(ro); 1663 ro->ro_rt = ro_fwd->ro_rt; 1664 ro_fwd->ro_rt = NULL; 1665 dst = SIN(&ro_fwd->ro_dst); 1666 1667 /* 1668 * If we added a default src ip earlier, 1669 * which would have been gotten from the-then 1670 * interface, do it again, from the new one. 1671 */ 1672 if (ia_fw != NULL) { 1673 if (ipobf.fwd_rewrite_src) { 1674 IFA_LOCK_SPIN(&ia_fw->ia_ifa); 1675 ip->ip_src = IA_SIN(ia_fw)->sin_addr; 1676 IFA_UNLOCK(&ia_fw->ia_ifa); 1677 } 1678 IFA_REMREF(&ia_fw->ia_ifa); 1679 } 1680 goto pass; 1681 } 1682#endif /* IPFIREWALL_FORWARD */ 1683 /* 1684 * if we get here, none of the above matches, and 1685 * we have to drop the pkt 1686 */ 1687 m_freem(m); 1688 error = EACCES; /* not sure this is the right error msg */ 1689 goto done; 1690 } 1691 1692pass: 1693#endif /* IPFIREWALL */ 1694 1695 /* 127/8 must not appear on wire - RFC1122 */ 1696 if (!(ifp->if_flags & IFF_LOOPBACK) && 1697 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || 1698 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) { 1699 OSAddAtomic(1, &ipstat.ips_badaddr); 1700 m_freem(m); 1701 error = EADDRNOTAVAIL; 1702 goto done; 1703 } 1704 1705 ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2), 1706 ip->ip_len, &sw_csum); 1707 1708 /* 1709 * If small enough for interface, or the interface will take 1710 * care of the fragmentation for us, can just send directly. 1711 */ 1712 if ((u_short)ip->ip_len <= ifp->if_mtu || TSO_IPV4_OK(ifp, m) || 1713 (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) { 1714#if BYTE_ORDER != BIG_ENDIAN 1715 HTONS(ip->ip_len); 1716 HTONS(ip->ip_off); 1717#endif 1718 1719 ip->ip_sum = 0; 1720 if (sw_csum & CSUM_DELAY_IP) { 1721 ip->ip_sum = ip_cksum_hdr_out(m, hlen); 1722 sw_csum &= ~CSUM_DELAY_IP; 1723 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 1724 } 1725 1726#if IPSEC 1727 /* clean ipsec history once it goes out of the node */ 1728 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) 1729 ipsec_delaux(m); 1730#endif /* IPSEC */ 1731 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) && 1732 (m->m_pkthdr.tso_segsz > 0)) 1733 scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz; 1734 else 1735 scnt++; 1736 1737 if (packetchain == 0) { 1738 if (ro->ro_rt != NULL && nstat_collect) 1739 nstat_route_tx(ro->ro_rt, scnt, 1740 m->m_pkthdr.len, 0); 1741 1742 error = dlil_output(ifp, PF_INET, m, ro->ro_rt, 1743 SA(dst), 0, adv); 1744 scnt = 0; 1745 goto done; 1746 } else { 1747 /* 1748 * packet chaining allows us to reuse the 1749 * route for all packets 1750 */ 1751 bytecnt += m->m_pkthdr.len; 1752 mppn = &m->m_nextpkt; 1753 m = m->m_nextpkt; 1754 if (m == NULL) { 1755#if PF 1756sendchain: 1757#endif /* PF */ 1758 if (pktcnt > ip_maxchainsent) 1759 ip_maxchainsent = pktcnt; 1760 if (ro->ro_rt != NULL && nstat_collect) 1761 nstat_route_tx(ro->ro_rt, scnt, 1762 bytecnt, 0); 1763 1764 error = dlil_output(ifp, PF_INET, packetlist, 1765 ro->ro_rt, SA(dst), 0, adv); 1766 pktcnt = 0; 1767 scnt = 0; 1768 bytecnt = 0; 1769 goto done; 1770 1771 } 1772 m0 = m; 1773 pktcnt++; 1774 goto loopit; 1775 } 1776 } 1777 /* 1778 * Too large for interface; fragment if possible. 1779 * Must be able to put at least 8 bytes per fragment. 1780 * Balk when DF bit is set or the interface didn't support TSO. 1781 */ 1782 if ((ip->ip_off & IP_DF) || pktcnt > 0 || 1783 (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) { 1784 error = EMSGSIZE; 1785 /* 1786 * This case can happen if the user changed the MTU 1787 * of an interface after enabling IP on it. Because 1788 * most netifs don't keep track of routes pointing to 1789 * them, there is no way for one to update all its 1790 * routes when the MTU is changed. 1791 */ 1792 if (ro->ro_rt) { 1793 RT_LOCK_SPIN(ro->ro_rt); 1794 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && 1795 !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && 1796 (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { 1797 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; 1798 } 1799 RT_UNLOCK(ro->ro_rt); 1800 } 1801 if (pktcnt > 0) { 1802 m0 = packetlist; 1803 } 1804 OSAddAtomic(1, &ipstat.ips_cantfrag); 1805 goto bad; 1806 } 1807 1808 error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum); 1809 if (error != 0) { 1810 m0 = m = NULL; 1811 goto bad; 1812 } 1813 1814 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr, 1815 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len); 1816 1817 for (m = m0; m; m = m0) { 1818 m0 = m->m_nextpkt; 1819 m->m_nextpkt = 0; 1820#if IPSEC 1821 /* clean ipsec history once it goes out of the node */ 1822 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) 1823 ipsec_delaux(m); 1824#endif /* IPSEC */ 1825 if (error == 0) { 1826 if ((packetchain != 0) && (pktcnt > 0)) { 1827 panic("%s: mix of packet in packetlist is " 1828 "wrong=%p", __func__, packetlist); 1829 /* NOTREACHED */ 1830 } 1831 if (ro->ro_rt != NULL && nstat_collect) { 1832 nstat_route_tx(ro->ro_rt, 1, 1833 m->m_pkthdr.len, 0); 1834 } 1835 error = dlil_output(ifp, PF_INET, m, ro->ro_rt, 1836 SA(dst), 0, adv); 1837 } else { 1838 m_freem(m); 1839 } 1840 } 1841 1842 if (error == 0) 1843 OSAddAtomic(1, &ipstat.ips_fragmented); 1844 1845done: 1846 if (ia != NULL) { 1847 IFA_REMREF(&ia->ia_ifa); 1848 ia = NULL; 1849 } 1850#if IPSEC 1851 ROUTE_RELEASE(&ipsec_state.ro); 1852 if (sp != NULL) { 1853 KEYDEBUG(KEYDEBUG_IPSEC_STAMP, 1854 printf("DP ip_output call free SP:%x\n", sp)); 1855 key_freesp(sp, KEY_SADB_UNLOCKED); 1856 } 1857#endif /* IPSEC */ 1858#if DUMMYNET 1859 ROUTE_RELEASE(&saved_route); 1860#endif /* DUMMYNET */ 1861#if IPFIREWALL_FORWARD 1862 ROUTE_RELEASE(&sro_fwd); 1863#endif /* IPFIREWALL_FORWARD */ 1864 1865 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0); 1866 return (error); 1867bad: 1868 m_freem(m0); 1869 goto done; 1870 1871#undef ipsec_state 1872#undef args 1873#undef sro_fwd 1874#undef saved_route 1875#undef ipf_pktopts 1876} 1877 1878int 1879ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum) 1880{ 1881 struct ip *ip, *mhip; 1882 int len, hlen, mhlen, firstlen, off, error = 0; 1883 struct mbuf **mnext = &m->m_nextpkt, *m0; 1884 int nfrags = 1; 1885 1886 ip = mtod(m, struct ip *); 1887#ifdef _IP_VHL 1888 hlen = IP_VHL_HL(ip->ip_vhl) << 2; 1889#else /* !_IP_VHL */ 1890 hlen = ip->ip_hl << 2; 1891#endif /* !_IP_VHL */ 1892 1893 firstlen = len = (mtu - hlen) &~ 7; 1894 if (len < 8) { 1895 m_freem(m); 1896 return (EMSGSIZE); 1897 } 1898 1899 /* 1900 * if the interface will not calculate checksums on 1901 * fragmented packets, then do it here. 1902 */ 1903 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) && 1904 !(ifp->if_hwassist & CSUM_IP_FRAGS)) 1905 in_delayed_cksum(m); 1906 1907 /* 1908 * Loop through length of segment after first fragment, 1909 * make new header and copy data of each part and link onto chain. 1910 */ 1911 m0 = m; 1912 mhlen = sizeof (struct ip); 1913 for (off = hlen + len; off < (u_short)ip->ip_len; off += len) { 1914 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ 1915 if (m == NULL) { 1916 error = ENOBUFS; 1917 OSAddAtomic(1, &ipstat.ips_odropped); 1918 goto sendorfree; 1919 } 1920 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; 1921 m->m_data += max_linkhdr; 1922 mhip = mtod(m, struct ip *); 1923 *mhip = *ip; 1924 if (hlen > sizeof (struct ip)) { 1925 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); 1926 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2); 1927 } 1928 m->m_len = mhlen; 1929 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF); 1930 if (ip->ip_off & IP_MF) 1931 mhip->ip_off |= IP_MF; 1932 if (off + len >= (u_short)ip->ip_len) 1933 len = (u_short)ip->ip_len - off; 1934 else 1935 mhip->ip_off |= IP_MF; 1936 mhip->ip_len = htons((u_short)(len + mhlen)); 1937 m->m_next = m_copy(m0, off, len); 1938 if (m->m_next == NULL) { 1939 (void) m_free(m); 1940 error = ENOBUFS; /* ??? */ 1941 OSAddAtomic(1, &ipstat.ips_odropped); 1942 goto sendorfree; 1943 } 1944 m->m_pkthdr.len = mhlen + len; 1945 m->m_pkthdr.rcvif = NULL; 1946 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; 1947 1948 M_COPY_CLASSIFIER(m, m0); 1949 M_COPY_PFTAG(m, m0); 1950 1951#if CONFIG_MACF_NET 1952 mac_netinet_fragment(m0, m); 1953#endif /* CONFIG_MACF_NET */ 1954 1955#if BYTE_ORDER != BIG_ENDIAN 1956 HTONS(mhip->ip_off); 1957#endif 1958 1959 mhip->ip_sum = 0; 1960 if (sw_csum & CSUM_DELAY_IP) { 1961 mhip->ip_sum = ip_cksum_hdr_out(m, mhlen); 1962 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 1963 } 1964 *mnext = m; 1965 mnext = &m->m_nextpkt; 1966 nfrags++; 1967 } 1968 OSAddAtomic(nfrags, &ipstat.ips_ofragments); 1969 1970 /* set first/last markers for fragment chain */ 1971 m->m_flags |= M_LASTFRAG; 1972 m0->m_flags |= M_FIRSTFRAG | M_FRAG; 1973 m0->m_pkthdr.csum_data = nfrags; 1974 1975 /* 1976 * Update first fragment by trimming what's been copied out 1977 * and updating header, then send each fragment (in order). 1978 */ 1979 m = m0; 1980 m_adj(m, hlen + firstlen - (u_short)ip->ip_len); 1981 m->m_pkthdr.len = hlen + firstlen; 1982 ip->ip_len = htons((u_short)m->m_pkthdr.len); 1983 ip->ip_off |= IP_MF; 1984 1985#if BYTE_ORDER != BIG_ENDIAN 1986 HTONS(ip->ip_off); 1987#endif 1988 1989 ip->ip_sum = 0; 1990 if (sw_csum & CSUM_DELAY_IP) { 1991 ip->ip_sum = ip_cksum_hdr_out(m, hlen); 1992 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 1993 } 1994sendorfree: 1995 if (error) 1996 m_freem_list(m0); 1997 1998 return (error); 1999} 2000 2001static void 2002ip_out_cksum_stats(int proto, u_int32_t len) 2003{ 2004 switch (proto) { 2005 case IPPROTO_TCP: 2006 tcp_out_cksum_stats(len); 2007 break; 2008 case IPPROTO_UDP: 2009 udp_out_cksum_stats(len); 2010 break; 2011 default: 2012 /* keep only TCP or UDP stats for now */ 2013 break; 2014 } 2015} 2016 2017/* 2018 * Process a delayed payload checksum calculation (outbound path.) 2019 * 2020 * hoff is the number of bytes beyond the mbuf data pointer which 2021 * points to the IP header. 2022 * 2023 * Returns a bitmask representing all the work done in software. 2024 */ 2025uint32_t 2026in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags) 2027{ 2028 unsigned char buf[15 << 2] __attribute__((aligned(8))); 2029 struct ip *ip; 2030 uint32_t offset, _hlen, mlen, hlen, len, sw_csum; 2031 uint16_t csum, ip_len; 2032 2033 _CASSERT(sizeof (csum) == sizeof (uint16_t)); 2034 VERIFY(m->m_flags & M_PKTHDR); 2035 2036 sw_csum = (csum_flags & m->m_pkthdr.csum_flags); 2037 2038 if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) 2039 goto done; 2040 2041 mlen = m->m_pkthdr.len; /* total mbuf len */ 2042 2043 /* sanity check (need at least simple IP header) */ 2044 if (mlen < (hoff + sizeof (*ip))) { 2045 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr " 2046 "(%u+%u)\n", __func__, m, mlen, hoff, 2047 (uint32_t)sizeof (*ip)); 2048 /* NOTREACHED */ 2049 } 2050 2051 /* 2052 * In case the IP header is not contiguous, or not 32-bit aligned, 2053 * or if we're computing the IP header checksum, copy it to a local 2054 * buffer. Copy only the simple IP header here (IP options case 2055 * is handled below.) 2056 */ 2057 if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof (*ip)) > m->m_len || 2058 !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) { 2059 m_copydata(m, hoff, sizeof (*ip), (caddr_t)buf); 2060 ip = (struct ip *)(void *)buf; 2061 _hlen = sizeof (*ip); 2062 } else { 2063 ip = (struct ip *)(void *)(m->m_data + hoff); 2064 _hlen = 0; 2065 } 2066 2067 hlen = IP_VHL_HL(ip->ip_vhl) << 2; /* IP header len */ 2068 2069 /* sanity check */ 2070 if (mlen < (hoff + hlen)) { 2071 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), " 2072 "hoff %u", __func__, m, mlen, hlen, hoff); 2073 /* NOTREACHED */ 2074 } 2075 2076 /* 2077 * We could be in the context of an IP or interface filter; in the 2078 * former case, ip_len would be in host (correct) order while for 2079 * the latter it would be in network order. Because of this, we 2080 * attempt to interpret the length field by comparing it against 2081 * the actual packet length. If the comparison fails, byte swap 2082 * the length and check again. If it still fails, use the actual 2083 * packet length. This also covers the trailing bytes case. 2084 */ 2085 ip_len = ip->ip_len; 2086 if (ip_len != (mlen - hoff)) { 2087 ip_len = OSSwapInt16(ip_len); 2088 if (ip_len != (mlen - hoff)) { 2089 printf("%s: mbuf 0x%llx proto %d IP len %d (%x) " 2090 "[swapped %d (%x)] doesn't match actual packet " 2091 "length; %d is used instead\n", __func__, 2092 (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p, 2093 ip->ip_len, ip->ip_len, ip_len, ip_len, 2094 (mlen - hoff)); 2095 ip_len = mlen - hoff; 2096 } 2097 } 2098 2099 len = ip_len - hlen; /* csum span */ 2100 2101 if (sw_csum & CSUM_DELAY_DATA) { 2102 uint16_t ulpoff; 2103 2104 /* 2105 * offset is added to the lower 16-bit value of csum_data, 2106 * which is expected to contain the ULP offset; therefore 2107 * CSUM_PARTIAL offset adjustment must be undone. 2108 */ 2109 if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL|CSUM_DATA_VALID)) == 2110 (CSUM_PARTIAL|CSUM_DATA_VALID)) { 2111 /* 2112 * Get back the original ULP offset (this will 2113 * undo the CSUM_PARTIAL logic in ip_output.) 2114 */ 2115 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff - 2116 m->m_pkthdr.csum_tx_start); 2117 } 2118 2119 ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */ 2120 offset = hoff + hlen; /* ULP header */ 2121 2122 if (mlen < (ulpoff + sizeof (csum))) { 2123 panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP " 2124 "cksum offset (%u) cksum flags 0x%x\n", __func__, 2125 m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags); 2126 /* NOTREACHED */ 2127 } 2128 2129 csum = inet_cksum(m, 0, offset, len); 2130 2131 /* Update stats */ 2132 ip_out_cksum_stats(ip->ip_p, len); 2133 2134 /* RFC1122 4.1.3.4 */ 2135 if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP)) 2136 csum = 0xffff; 2137 2138 /* Insert the checksum in the ULP csum field */ 2139 offset += ulpoff; 2140 if (offset + sizeof (csum) > m->m_len) { 2141 m_copyback(m, offset, sizeof (csum), &csum); 2142 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) { 2143 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum; 2144 } else { 2145 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); 2146 } 2147 m->m_pkthdr.csum_flags &= 2148 ~(CSUM_DELAY_DATA | CSUM_DATA_VALID | CSUM_PARTIAL); 2149 } 2150 2151 if (sw_csum & CSUM_DELAY_IP) { 2152 /* IP header must be in the local buffer */ 2153 VERIFY(_hlen == sizeof (*ip)); 2154 if (_hlen != hlen) { 2155 VERIFY(hlen <= sizeof (buf)); 2156 m_copydata(m, hoff, hlen, (caddr_t)buf); 2157 ip = (struct ip *)(void *)buf; 2158 _hlen = hlen; 2159 } 2160 2161 /* 2162 * Compute the IP header checksum as if the IP length 2163 * is the length which we believe is "correct"; see 2164 * how ip_len gets calculated above. Note that this 2165 * is done on the local copy and not on the real one. 2166 */ 2167 ip->ip_len = htons(ip_len); 2168 ip->ip_sum = 0; 2169 csum = in_cksum_hdr_opt(ip); 2170 2171 /* Update stats */ 2172 ipstat.ips_snd_swcsum++; 2173 ipstat.ips_snd_swcsum_bytes += hlen; 2174 2175 /* 2176 * Insert only the checksum in the existing IP header 2177 * csum field; all other fields are left unchanged. 2178 */ 2179 offset = hoff + offsetof(struct ip, ip_sum); 2180 if (offset + sizeof (csum) > m->m_len) { 2181 m_copyback(m, offset, sizeof (csum), &csum); 2182 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) { 2183 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum; 2184 } else { 2185 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum)); 2186 } 2187 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP; 2188 } 2189 2190done: 2191 return (sw_csum); 2192} 2193 2194/* 2195 * Insert IP options into preformed packet. 2196 * Adjust IP destination as required for IP source routing, 2197 * as indicated by a non-zero in_addr at the start of the options. 2198 * 2199 * XXX This routine assumes that the packet has no options in place. 2200 */ 2201static struct mbuf * 2202ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 2203{ 2204 struct ipoption *p = mtod(opt, struct ipoption *); 2205 struct mbuf *n; 2206 struct ip *ip = mtod(m, struct ip *); 2207 unsigned optlen; 2208 2209 optlen = opt->m_len - sizeof (p->ipopt_dst); 2210 if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) 2211 return (m); /* XXX should fail */ 2212 if (p->ipopt_dst.s_addr) 2213 ip->ip_dst = p->ipopt_dst; 2214 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 2215 MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */ 2216 if (n == NULL) 2217 return (m); 2218 n->m_pkthdr.rcvif = 0; 2219#if CONFIG_MACF_NET 2220 mac_mbuf_label_copy(m, n); 2221#endif /* CONFIG_MACF_NET */ 2222 n->m_pkthdr.len = m->m_pkthdr.len + optlen; 2223 m->m_len -= sizeof (struct ip); 2224 m->m_data += sizeof (struct ip); 2225 n->m_next = m; 2226 m = n; 2227 m->m_len = optlen + sizeof (struct ip); 2228 m->m_data += max_linkhdr; 2229 (void) memcpy(mtod(m, void *), ip, sizeof (struct ip)); 2230 } else { 2231 m->m_data -= optlen; 2232 m->m_len += optlen; 2233 m->m_pkthdr.len += optlen; 2234 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof (struct ip)); 2235 } 2236 ip = mtod(m, struct ip *); 2237 bcopy(p->ipopt_list, ip + 1, optlen); 2238 *phlen = sizeof (struct ip) + optlen; 2239 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2); 2240 ip->ip_len += optlen; 2241 return (m); 2242} 2243 2244/* 2245 * Copy options from ip to jp, 2246 * omitting those not copied during fragmentation. 2247 */ 2248static int 2249ip_optcopy(struct ip *ip, struct ip *jp) 2250{ 2251 u_char *cp, *dp; 2252 int opt, optlen, cnt; 2253 2254 cp = (u_char *)(ip + 1); 2255 dp = (u_char *)(jp + 1); 2256 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip); 2257 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2258 opt = cp[0]; 2259 if (opt == IPOPT_EOL) 2260 break; 2261 if (opt == IPOPT_NOP) { 2262 /* Preserve for IP mcast tunnel's LSRR alignment. */ 2263 *dp++ = IPOPT_NOP; 2264 optlen = 1; 2265 continue; 2266 } 2267#if DIAGNOSTIC 2268 if (cnt < IPOPT_OLEN + sizeof (*cp)) { 2269 panic("malformed IPv4 option passed to ip_optcopy"); 2270 /* NOTREACHED */ 2271 } 2272#endif 2273 optlen = cp[IPOPT_OLEN]; 2274#if DIAGNOSTIC 2275 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) { 2276 panic("malformed IPv4 option passed to ip_optcopy"); 2277 /* NOTREACHED */ 2278 } 2279#endif 2280 /* bogus lengths should have been caught by ip_dooptions */ 2281 if (optlen > cnt) 2282 optlen = cnt; 2283 if (IPOPT_COPIED(opt)) { 2284 bcopy(cp, dp, optlen); 2285 dp += optlen; 2286 } 2287 } 2288 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 2289 *dp++ = IPOPT_EOL; 2290 return (optlen); 2291} 2292 2293/* 2294 * IP socket option processing. 2295 */ 2296int 2297ip_ctloutput(struct socket *so, struct sockopt *sopt) 2298{ 2299 struct inpcb *inp = sotoinpcb(so); 2300 int error, optval; 2301 2302 error = optval = 0; 2303 if (sopt->sopt_level != IPPROTO_IP) 2304 return (EINVAL); 2305 2306 switch (sopt->sopt_dir) { 2307 case SOPT_SET: 2308 switch (sopt->sopt_name) { 2309#ifdef notyet 2310 case IP_RETOPTS: 2311#endif 2312 case IP_OPTIONS: { 2313 struct mbuf *m; 2314 2315 if (sopt->sopt_valsize > MLEN) { 2316 error = EMSGSIZE; 2317 break; 2318 } 2319 MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT, 2320 MT_HEADER); 2321 if (m == NULL) { 2322 error = ENOBUFS; 2323 break; 2324 } 2325 m->m_len = sopt->sopt_valsize; 2326 error = sooptcopyin(sopt, mtod(m, char *), 2327 m->m_len, m->m_len); 2328 if (error) 2329 break; 2330 2331 return (ip_pcbopts(sopt->sopt_name, 2332 &inp->inp_options, m)); 2333 } 2334 2335 case IP_TOS: 2336 case IP_TTL: 2337 case IP_RECVOPTS: 2338 case IP_RECVRETOPTS: 2339 case IP_RECVDSTADDR: 2340 case IP_RECVIF: 2341 case IP_RECVTTL: 2342 case IP_RECVPKTINFO: 2343 error = sooptcopyin(sopt, &optval, sizeof (optval), 2344 sizeof (optval)); 2345 if (error) 2346 break; 2347 2348 switch (sopt->sopt_name) { 2349 case IP_TOS: 2350 inp->inp_ip_tos = optval; 2351 break; 2352 2353 case IP_TTL: 2354 inp->inp_ip_ttl = optval; 2355 break; 2356#define OPTSET(bit) \ 2357 if (optval) \ 2358 inp->inp_flags |= bit; \ 2359 else \ 2360 inp->inp_flags &= ~bit; 2361 2362 case IP_RECVOPTS: 2363 OPTSET(INP_RECVOPTS); 2364 break; 2365 2366 case IP_RECVRETOPTS: 2367 OPTSET(INP_RECVRETOPTS); 2368 break; 2369 2370 case IP_RECVDSTADDR: 2371 OPTSET(INP_RECVDSTADDR); 2372 break; 2373 2374 case IP_RECVIF: 2375 OPTSET(INP_RECVIF); 2376 break; 2377 2378 case IP_RECVTTL: 2379 OPTSET(INP_RECVTTL); 2380 break; 2381 2382 case IP_RECVPKTINFO: 2383 OPTSET(INP_PKTINFO); 2384 break; 2385 } 2386 break; 2387#undef OPTSET 2388 2389#if CONFIG_FORCE_OUT_IFP 2390 /* 2391 * Apple private interface, similar to IP_BOUND_IF, except 2392 * that the parameter is a NULL-terminated string containing 2393 * the name of the network interface; an emptry string means 2394 * unbind. Applications are encouraged to use IP_BOUND_IF 2395 * instead, as that is the current "official" API. 2396 */ 2397 case IP_FORCE_OUT_IFP: { 2398 char ifname[IFNAMSIZ]; 2399 unsigned int ifscope; 2400 2401 /* This option is settable only for IPv4 */ 2402 if (!(inp->inp_vflag & INP_IPV4)) { 2403 error = EINVAL; 2404 break; 2405 } 2406 2407 /* Verify interface name parameter is sane */ 2408 if (sopt->sopt_valsize > sizeof (ifname)) { 2409 error = EINVAL; 2410 break; 2411 } 2412 2413 /* Copy the interface name */ 2414 if (sopt->sopt_valsize != 0) { 2415 error = sooptcopyin(sopt, ifname, 2416 sizeof (ifname), sopt->sopt_valsize); 2417 if (error) 2418 break; 2419 } 2420 2421 if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { 2422 /* Unbind this socket from any interface */ 2423 ifscope = IFSCOPE_NONE; 2424 } else { 2425 ifnet_t ifp; 2426 2427 /* Verify name is NULL terminated */ 2428 if (ifname[sopt->sopt_valsize - 1] != '\0') { 2429 error = EINVAL; 2430 break; 2431 } 2432 2433 /* Bail out if given bogus interface name */ 2434 if (ifnet_find_by_name(ifname, &ifp) != 0) { 2435 error = ENXIO; 2436 break; 2437 } 2438 2439 /* Bind this socket to this interface */ 2440 ifscope = ifp->if_index; 2441 2442 /* 2443 * Won't actually free; since we don't release 2444 * this later, we should do it now. 2445 */ 2446 ifnet_release(ifp); 2447 } 2448 error = inp_bindif(inp, ifscope, NULL); 2449 } 2450 break; 2451#endif /* CONFIG_FORCE_OUT_IFP */ 2452 /* 2453 * Multicast socket options are processed by the in_mcast 2454 * module. 2455 */ 2456 case IP_MULTICAST_IF: 2457 case IP_MULTICAST_IFINDEX: 2458 case IP_MULTICAST_VIF: 2459 case IP_MULTICAST_TTL: 2460 case IP_MULTICAST_LOOP: 2461 case IP_ADD_MEMBERSHIP: 2462 case IP_DROP_MEMBERSHIP: 2463 case IP_ADD_SOURCE_MEMBERSHIP: 2464 case IP_DROP_SOURCE_MEMBERSHIP: 2465 case IP_BLOCK_SOURCE: 2466 case IP_UNBLOCK_SOURCE: 2467 case IP_MSFILTER: 2468 case MCAST_JOIN_GROUP: 2469 case MCAST_LEAVE_GROUP: 2470 case MCAST_JOIN_SOURCE_GROUP: 2471 case MCAST_LEAVE_SOURCE_GROUP: 2472 case MCAST_BLOCK_SOURCE: 2473 case MCAST_UNBLOCK_SOURCE: 2474 error = inp_setmoptions(inp, sopt); 2475 break; 2476 2477 case IP_PORTRANGE: 2478 error = sooptcopyin(sopt, &optval, sizeof (optval), 2479 sizeof (optval)); 2480 if (error) 2481 break; 2482 2483 switch (optval) { 2484 case IP_PORTRANGE_DEFAULT: 2485 inp->inp_flags &= ~(INP_LOWPORT); 2486 inp->inp_flags &= ~(INP_HIGHPORT); 2487 break; 2488 2489 case IP_PORTRANGE_HIGH: 2490 inp->inp_flags &= ~(INP_LOWPORT); 2491 inp->inp_flags |= INP_HIGHPORT; 2492 break; 2493 2494 case IP_PORTRANGE_LOW: 2495 inp->inp_flags &= ~(INP_HIGHPORT); 2496 inp->inp_flags |= INP_LOWPORT; 2497 break; 2498 2499 default: 2500 error = EINVAL; 2501 break; 2502 } 2503 break; 2504 2505#if IPSEC 2506 case IP_IPSEC_POLICY: { 2507 caddr_t req = NULL; 2508 size_t len = 0; 2509 int priv; 2510 struct mbuf *m; 2511 int optname; 2512 2513 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ 2514 break; 2515 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ 2516 break; 2517 priv = (proc_suser(sopt->sopt_p) == 0); 2518 if (m) { 2519 req = mtod(m, caddr_t); 2520 len = m->m_len; 2521 } 2522 optname = sopt->sopt_name; 2523 error = ipsec4_set_policy(inp, optname, req, len, priv); 2524 m_freem(m); 2525 break; 2526 } 2527#endif /* IPSEC */ 2528 2529#if TRAFFIC_MGT 2530 case IP_TRAFFIC_MGT_BACKGROUND: { 2531 unsigned background = 0; 2532 2533 error = sooptcopyin(sopt, &background, 2534 sizeof (background), sizeof (background)); 2535 if (error) 2536 break; 2537 2538 if (background) { 2539 socket_set_traffic_mgt_flags_locked(so, 2540 TRAFFIC_MGT_SO_BACKGROUND); 2541 } else { 2542 socket_clear_traffic_mgt_flags_locked(so, 2543 TRAFFIC_MGT_SO_BACKGROUND); 2544 } 2545 2546 break; 2547 } 2548#endif /* TRAFFIC_MGT */ 2549 2550 /* 2551 * On a multihomed system, scoped routing can be used to 2552 * restrict the source interface used for sending packets. 2553 * The socket option IP_BOUND_IF binds a particular AF_INET 2554 * socket to an interface such that data sent on the socket 2555 * is restricted to that interface. This is unlike the 2556 * SO_DONTROUTE option where the routing table is bypassed; 2557 * therefore it allows for a greater flexibility and control 2558 * over the system behavior, and does not place any restriction 2559 * on the destination address type (e.g. unicast, multicast, 2560 * or broadcast if applicable) or whether or not the host is 2561 * directly reachable. Note that in the multicast transmit 2562 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over 2563 * IP_BOUND_IF, since the former practically bypasses the 2564 * routing table; in this case, IP_BOUND_IF sets the default 2565 * interface used for sending multicast packets in the absence 2566 * of an explicit multicast transmit interface. 2567 */ 2568 case IP_BOUND_IF: 2569 /* This option is settable only for IPv4 */ 2570 if (!(inp->inp_vflag & INP_IPV4)) { 2571 error = EINVAL; 2572 break; 2573 } 2574 2575 error = sooptcopyin(sopt, &optval, sizeof (optval), 2576 sizeof (optval)); 2577 2578 if (error) 2579 break; 2580 2581 error = inp_bindif(inp, optval, NULL); 2582 break; 2583 2584 case IP_NO_IFT_CELLULAR: 2585 /* This option is settable only for IPv4 */ 2586 if (!(inp->inp_vflag & INP_IPV4)) { 2587 error = EINVAL; 2588 break; 2589 } 2590 2591 error = sooptcopyin(sopt, &optval, sizeof (optval), 2592 sizeof (optval)); 2593 2594 if (error) 2595 break; 2596 2597 /* once set, it cannot be unset */ 2598 if (!optval && (inp->inp_flags & INP_NO_IFT_CELLULAR)) { 2599 error = EINVAL; 2600 break; 2601 } 2602 2603 error = so_set_restrictions(so, 2604 SO_RESTRICT_DENY_CELLULAR); 2605 break; 2606 2607 case IP_OUT_IF: 2608 /* This option is not settable */ 2609 error = EINVAL; 2610 break; 2611 2612 default: 2613 error = ENOPROTOOPT; 2614 break; 2615 } 2616 break; 2617 2618 case SOPT_GET: 2619 switch (sopt->sopt_name) { 2620 case IP_OPTIONS: 2621 case IP_RETOPTS: 2622 if (inp->inp_options) { 2623 error = sooptcopyout(sopt, 2624 mtod(inp->inp_options, char *), 2625 inp->inp_options->m_len); 2626 } else { 2627 sopt->sopt_valsize = 0; 2628 } 2629 break; 2630 2631 case IP_TOS: 2632 case IP_TTL: 2633 case IP_RECVOPTS: 2634 case IP_RECVRETOPTS: 2635 case IP_RECVDSTADDR: 2636 case IP_RECVIF: 2637 case IP_RECVTTL: 2638 case IP_PORTRANGE: 2639 case IP_RECVPKTINFO: 2640 switch (sopt->sopt_name) { 2641 2642 case IP_TOS: 2643 optval = inp->inp_ip_tos; 2644 break; 2645 2646 case IP_TTL: 2647 optval = inp->inp_ip_ttl; 2648 break; 2649 2650#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 2651 2652 case IP_RECVOPTS: 2653 optval = OPTBIT(INP_RECVOPTS); 2654 break; 2655 2656 case IP_RECVRETOPTS: 2657 optval = OPTBIT(INP_RECVRETOPTS); 2658 break; 2659 2660 case IP_RECVDSTADDR: 2661 optval = OPTBIT(INP_RECVDSTADDR); 2662 break; 2663 2664 case IP_RECVIF: 2665 optval = OPTBIT(INP_RECVIF); 2666 break; 2667 2668 case IP_RECVTTL: 2669 optval = OPTBIT(INP_RECVTTL); 2670 break; 2671 2672 case IP_PORTRANGE: 2673 if (inp->inp_flags & INP_HIGHPORT) 2674 optval = IP_PORTRANGE_HIGH; 2675 else if (inp->inp_flags & INP_LOWPORT) 2676 optval = IP_PORTRANGE_LOW; 2677 else 2678 optval = 0; 2679 break; 2680 2681 case IP_RECVPKTINFO: 2682 optval = OPTBIT(INP_PKTINFO); 2683 break; 2684 } 2685 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2686 break; 2687 2688 case IP_MULTICAST_IF: 2689 case IP_MULTICAST_IFINDEX: 2690 case IP_MULTICAST_VIF: 2691 case IP_MULTICAST_TTL: 2692 case IP_MULTICAST_LOOP: 2693 case IP_MSFILTER: 2694 error = inp_getmoptions(inp, sopt); 2695 break; 2696 2697#if IPSEC 2698 case IP_IPSEC_POLICY: { 2699 struct mbuf *m = NULL; 2700 caddr_t req = NULL; 2701 size_t len = 0; 2702 2703 if (m != NULL) { 2704 req = mtod(m, caddr_t); 2705 len = m->m_len; 2706 } 2707 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m); 2708 if (error == 0) 2709 error = soopt_mcopyout(sopt, m); /* XXX */ 2710 if (error == 0) 2711 m_freem(m); 2712 break; 2713 } 2714#endif /* IPSEC */ 2715 2716#if TRAFFIC_MGT 2717 case IP_TRAFFIC_MGT_BACKGROUND: { 2718 unsigned background = (so->so_traffic_mgt_flags & 2719 TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0; 2720 return (sooptcopyout(sopt, &background, 2721 sizeof (background))); 2722 break; 2723 } 2724#endif /* TRAFFIC_MGT */ 2725 2726 case IP_BOUND_IF: 2727 if (inp->inp_flags & INP_BOUND_IF) 2728 optval = inp->inp_boundifp->if_index; 2729 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2730 break; 2731 2732 case IP_NO_IFT_CELLULAR: 2733 optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; 2734 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2735 break; 2736 2737 case IP_OUT_IF: 2738 optval = (inp->inp_last_outifp != NULL) ? 2739 inp->inp_last_outifp->if_index : 0; 2740 error = sooptcopyout(sopt, &optval, sizeof (optval)); 2741 break; 2742 2743 default: 2744 error = ENOPROTOOPT; 2745 break; 2746 } 2747 break; 2748 } 2749 return (error); 2750} 2751 2752/* 2753 * Set up IP options in pcb for insertion in output packets. 2754 * Store in mbuf with pointer in pcbopt, adding pseudo-option 2755 * with destination address if source routed. 2756 */ 2757static int 2758ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m) 2759{ 2760#pragma unused(optname) 2761 int cnt, optlen; 2762 u_char *cp; 2763 u_char opt; 2764 2765 /* turn off any old options */ 2766 if (*pcbopt) 2767 (void) m_free(*pcbopt); 2768 *pcbopt = 0; 2769 if (m == (struct mbuf *)0 || m->m_len == 0) { 2770 /* 2771 * Only turning off any previous options. 2772 */ 2773 if (m) 2774 (void) m_free(m); 2775 return (0); 2776 } 2777 2778 if (m->m_len % sizeof (int32_t)) 2779 goto bad; 2780 2781 /* 2782 * IP first-hop destination address will be stored before 2783 * actual options; move other options back 2784 * and clear it when none present. 2785 */ 2786 if (m->m_data + m->m_len + sizeof (struct in_addr) >= &m->m_dat[MLEN]) 2787 goto bad; 2788 cnt = m->m_len; 2789 m->m_len += sizeof (struct in_addr); 2790 cp = mtod(m, u_char *) + sizeof (struct in_addr); 2791 ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt); 2792 bzero(mtod(m, caddr_t), sizeof (struct in_addr)); 2793 2794 for (; cnt > 0; cnt -= optlen, cp += optlen) { 2795 opt = cp[IPOPT_OPTVAL]; 2796 if (opt == IPOPT_EOL) 2797 break; 2798 if (opt == IPOPT_NOP) 2799 optlen = 1; 2800 else { 2801 if (cnt < IPOPT_OLEN + sizeof (*cp)) 2802 goto bad; 2803 optlen = cp[IPOPT_OLEN]; 2804 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) 2805 goto bad; 2806 } 2807 switch (opt) { 2808 2809 default: 2810 break; 2811 2812 case IPOPT_LSRR: 2813 case IPOPT_SSRR: 2814 /* 2815 * user process specifies route as: 2816 * ->A->B->C->D 2817 * D must be our final destination (but we can't 2818 * check that since we may not have connected yet). 2819 * A is first hop destination, which doesn't appear in 2820 * actual IP option, but is stored before the options. 2821 */ 2822 if (optlen < IPOPT_MINOFF - 1 + sizeof (struct in_addr)) 2823 goto bad; 2824 m->m_len -= sizeof (struct in_addr); 2825 cnt -= sizeof (struct in_addr); 2826 optlen -= sizeof (struct in_addr); 2827 cp[IPOPT_OLEN] = optlen; 2828 /* 2829 * Move first hop before start of options. 2830 */ 2831 bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), 2832 sizeof (struct in_addr)); 2833 /* 2834 * Then copy rest of options back 2835 * to close up the deleted entry. 2836 */ 2837 ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] + 2838 sizeof (struct in_addr)), 2839 (caddr_t)&cp[IPOPT_OFFSET+1], 2840 (unsigned)cnt + sizeof (struct in_addr)); 2841 break; 2842 } 2843 } 2844 if (m->m_len > MAX_IPOPTLEN + sizeof (struct in_addr)) 2845 goto bad; 2846 *pcbopt = m; 2847 return (0); 2848 2849bad: 2850 (void) m_free(m); 2851 return (EINVAL); 2852} 2853 2854void 2855ip_moptions_init(void) 2856{ 2857 PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug)); 2858 2859 imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) : 2860 sizeof (struct ip_moptions_dbg); 2861 2862 imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0, 2863 IMO_ZONE_NAME); 2864 if (imo_zone == NULL) { 2865 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME); 2866 /* NOTREACHED */ 2867 } 2868 zone_change(imo_zone, Z_EXPAND, TRUE); 2869} 2870 2871void 2872imo_addref(struct ip_moptions *imo, int locked) 2873{ 2874 if (!locked) 2875 IMO_LOCK(imo); 2876 else 2877 IMO_LOCK_ASSERT_HELD(imo); 2878 2879 if (++imo->imo_refcnt == 0) { 2880 panic("%s: imo %p wraparound refcnt\n", __func__, imo); 2881 /* NOTREACHED */ 2882 } else if (imo->imo_trace != NULL) { 2883 (*imo->imo_trace)(imo, TRUE); 2884 } 2885 2886 if (!locked) 2887 IMO_UNLOCK(imo); 2888} 2889 2890void 2891imo_remref(struct ip_moptions *imo) 2892{ 2893 int i; 2894 2895 IMO_LOCK(imo); 2896 if (imo->imo_refcnt == 0) { 2897 panic("%s: imo %p negative refcnt", __func__, imo); 2898 /* NOTREACHED */ 2899 } else if (imo->imo_trace != NULL) { 2900 (*imo->imo_trace)(imo, FALSE); 2901 } 2902 2903 --imo->imo_refcnt; 2904 if (imo->imo_refcnt > 0) { 2905 IMO_UNLOCK(imo); 2906 return; 2907 } 2908 2909 for (i = 0; i < imo->imo_num_memberships; ++i) { 2910 struct in_mfilter *imf; 2911 2912 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL; 2913 if (imf != NULL) 2914 imf_leave(imf); 2915 2916 (void) in_leavegroup(imo->imo_membership[i], imf); 2917 2918 if (imf != NULL) 2919 imf_purge(imf); 2920 2921 INM_REMREF(imo->imo_membership[i]); 2922 imo->imo_membership[i] = NULL; 2923 } 2924 imo->imo_num_memberships = 0; 2925 if (imo->imo_mfilters != NULL) { 2926 FREE(imo->imo_mfilters, M_INMFILTER); 2927 imo->imo_mfilters = NULL; 2928 } 2929 if (imo->imo_membership != NULL) { 2930 FREE(imo->imo_membership, M_IPMOPTS); 2931 imo->imo_membership = NULL; 2932 } 2933 IMO_UNLOCK(imo); 2934 2935 lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp); 2936 2937 if (!(imo->imo_debug & IFD_ALLOC)) { 2938 panic("%s: imo %p cannot be freed", __func__, imo); 2939 /* NOTREACHED */ 2940 } 2941 zfree(imo_zone, imo); 2942} 2943 2944static void 2945imo_trace(struct ip_moptions *imo, int refhold) 2946{ 2947 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo; 2948 ctrace_t *tr; 2949 u_int32_t idx; 2950 u_int16_t *cnt; 2951 2952 if (!(imo->imo_debug & IFD_DEBUG)) { 2953 panic("%s: imo %p has no debug structure", __func__, imo); 2954 /* NOTREACHED */ 2955 } 2956 if (refhold) { 2957 cnt = &imo_dbg->imo_refhold_cnt; 2958 tr = imo_dbg->imo_refhold; 2959 } else { 2960 cnt = &imo_dbg->imo_refrele_cnt; 2961 tr = imo_dbg->imo_refrele; 2962 } 2963 2964 idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE; 2965 ctrace_record(&tr[idx]); 2966} 2967 2968struct ip_moptions * 2969ip_allocmoptions(int how) 2970{ 2971 struct ip_moptions *imo; 2972 2973 imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone); 2974 if (imo != NULL) { 2975 bzero(imo, imo_size); 2976 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr); 2977 imo->imo_debug |= IFD_ALLOC; 2978 if (imo_debug != 0) { 2979 imo->imo_debug |= IFD_DEBUG; 2980 imo->imo_trace = imo_trace; 2981 } 2982 IMO_ADDREF(imo); 2983 } 2984 2985 return (imo); 2986} 2987 2988/* 2989 * Routine called from ip_output() to loop back a copy of an IP multicast 2990 * packet to the input queue of a specified interface. Note that this 2991 * calls the output routine of the loopback "driver", but with an interface 2992 * pointer that might NOT be a loopback interface -- evil, but easier than 2993 * replicating that code here. 2994 */ 2995static void 2996ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m, 2997 struct sockaddr_in *dst, int hlen) 2998{ 2999 struct mbuf *copym; 3000 struct ip *ip; 3001 3002 if (lo_ifp == NULL) 3003 return; 3004 3005 /* 3006 * Copy the packet header as it's needed for the checksum 3007 * Make sure to deep-copy IP header portion in case the data 3008 * is in an mbuf cluster, so that we can safely override the IP 3009 * header portion later. 3010 */ 3011 copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR); 3012 if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) 3013 copym = m_pullup(copym, hlen); 3014 3015 if (copym == NULL) 3016 return; 3017 3018 /* 3019 * We don't bother to fragment if the IP length is greater 3020 * than the interface's MTU. Can this possibly matter? 3021 */ 3022 ip = mtod(copym, struct ip *); 3023#if BYTE_ORDER != BIG_ENDIAN 3024 HTONS(ip->ip_len); 3025 HTONS(ip->ip_off); 3026#endif 3027 ip->ip_sum = 0; 3028 ip->ip_sum = ip_cksum_hdr_out(copym, hlen); 3029 3030 /* 3031 * Mark checksum as valid unless receive checksum offload is 3032 * disabled; if so, compute checksum in software. If the 3033 * interface itself is lo0, this will be overridden by if_loop. 3034 */ 3035 if (hwcksum_rx) { 3036 copym->m_pkthdr.csum_flags &= ~CSUM_PARTIAL; 3037 copym->m_pkthdr.csum_flags |= 3038 CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 3039 copym->m_pkthdr.csum_data = 0xffff; 3040 } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 3041#if BYTE_ORDER != BIG_ENDIAN 3042 NTOHS(ip->ip_len); 3043#endif 3044 in_delayed_cksum(copym); 3045#if BYTE_ORDER != BIG_ENDIAN 3046 HTONS(ip->ip_len); 3047#endif 3048 } 3049 3050 /* 3051 * Stuff the 'real' ifp into the pkthdr, to be used in matching 3052 * in ip_input(); we need the loopback ifp/dl_tag passed as args 3053 * to make the loopback driver compliant with the data link 3054 * requirements. 3055 */ 3056 copym->m_pkthdr.rcvif = origifp; 3057 3058 /* 3059 * Also record the source interface (which owns the source address). 3060 * This is basically a stripped down version of ifa_foraddr(). 3061 */ 3062 if (srcifp == NULL) { 3063 struct in_ifaddr *ia; 3064 3065 lck_rw_lock_shared(in_ifaddr_rwlock); 3066 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) { 3067 IFA_LOCK_SPIN(&ia->ia_ifa); 3068 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) { 3069 srcifp = ia->ia_ifp; 3070 IFA_UNLOCK(&ia->ia_ifa); 3071 break; 3072 } 3073 IFA_UNLOCK(&ia->ia_ifa); 3074 } 3075 lck_rw_done(in_ifaddr_rwlock); 3076 } 3077 if (srcifp != NULL) 3078 ip_setsrcifaddr_info(copym, srcifp->if_index, NULL); 3079 ip_setdstifaddr_info(copym, origifp->if_index, NULL); 3080 3081 dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL); 3082} 3083 3084/* 3085 * Given a source IP address (and route, if available), determine the best 3086 * interface to send the packet from. Checking for (and updating) the 3087 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done 3088 * without any locks based on the assumption that ip_output() is single- 3089 * threaded per-pcb, i.e. for any given pcb there can only be one thread 3090 * performing output at the IP layer. 3091 * 3092 * This routine is analogous to in6_selectroute() for IPv6. 3093 */ 3094static struct ifaddr * 3095in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) 3096{ 3097 struct ifaddr *ifa = NULL; 3098 struct in_addr src = ip->ip_src; 3099 struct in_addr dst = ip->ip_dst; 3100 struct ifnet *rt_ifp; 3101 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN]; 3102 3103 VERIFY(src.s_addr != INADDR_ANY); 3104 3105 if (ip_select_srcif_debug) { 3106 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src)); 3107 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst)); 3108 } 3109 3110 if (ro->ro_rt != NULL) 3111 RT_LOCK(ro->ro_rt); 3112 3113 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; 3114 3115 /* 3116 * Given the source IP address, find a suitable source interface 3117 * to use for transmission; if the caller has specified a scope, 3118 * optimize the search by looking at the addresses only for that 3119 * interface. This is still suboptimal, however, as we need to 3120 * traverse the per-interface list. 3121 */ 3122 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) { 3123 unsigned int scope = ifscope; 3124 3125 /* 3126 * If no scope is specified and the route is stale (pointing 3127 * to a defunct interface) use the current primary interface; 3128 * this happens when switching between interfaces configured 3129 * with the same IP address. Otherwise pick up the scope 3130 * information from the route; the ULP may have looked up a 3131 * correct route and we just need to verify it here and mark 3132 * it with the ROF_SRCIF_SELECTED flag below. 3133 */ 3134 if (scope == IFSCOPE_NONE) { 3135 scope = rt_ifp->if_index; 3136 if (scope != get_primary_ifscope(AF_INET) && 3137 ROUTE_UNUSABLE(ro)) 3138 scope = get_primary_ifscope(AF_INET); 3139 } 3140 3141 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope); 3142 3143 if (ifa == NULL && ip->ip_p != IPPROTO_UDP && 3144 ip->ip_p != IPPROTO_TCP && ipforwarding) { 3145 /* 3146 * If forwarding is enabled, and if the packet isn't 3147 * TCP or UDP, check if the source address belongs 3148 * to one of our own interfaces; if so, demote the 3149 * interface scope and do a route lookup right below. 3150 */ 3151 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); 3152 if (ifa != NULL) { 3153 IFA_REMREF(ifa); 3154 ifa = NULL; 3155 ifscope = IFSCOPE_NONE; 3156 } 3157 } 3158 3159 if (ip_select_srcif_debug && ifa != NULL) { 3160 if (ro->ro_rt != NULL) { 3161 printf("%s->%s ifscope %d->%d ifa_if %s " 3162 "ro_if %s\n", s_src, s_dst, ifscope, 3163 scope, if_name(ifa->ifa_ifp), 3164 if_name(rt_ifp)); 3165 } else { 3166 printf("%s->%s ifscope %d->%d ifa_if %s\n", 3167 s_src, s_dst, ifscope, scope, 3168 if_name(ifa->ifa_ifp)); 3169 } 3170 } 3171 } 3172 3173 /* 3174 * Slow path; search for an interface having the corresponding source 3175 * IP address if the scope was not specified by the caller, and: 3176 * 3177 * 1) There currently isn't any route, or, 3178 * 2) The interface used by the route does not own that source 3179 * IP address; in this case, the route will get blown away 3180 * and we'll do a more specific scoped search using the newly 3181 * found interface. 3182 */ 3183 if (ifa == NULL && ifscope == IFSCOPE_NONE) { 3184 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); 3185 3186 /* 3187 * If we have the IP address, but not the route, we don't 3188 * really know whether or not it belongs to the correct 3189 * interface (it could be shared across multiple interfaces.) 3190 * The only way to find out is to do a route lookup. 3191 */ 3192 if (ifa != NULL && ro->ro_rt == NULL) { 3193 struct rtentry *rt; 3194 struct sockaddr_in sin; 3195 struct ifaddr *oifa = NULL; 3196 3197 bzero(&sin, sizeof (sin)); 3198 sin.sin_family = AF_INET; 3199 sin.sin_len = sizeof (sin); 3200 sin.sin_addr = dst; 3201 3202 lck_mtx_lock(rnh_lock); 3203 if ((rt = rt_lookup(TRUE, SA(&sin), NULL, 3204 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) { 3205 RT_LOCK(rt); 3206 /* 3207 * If the route uses a different interface, 3208 * use that one instead. The IP address of 3209 * the ifaddr that we pick up here is not 3210 * relevant. 3211 */ 3212 if (ifa->ifa_ifp != rt->rt_ifp) { 3213 oifa = ifa; 3214 ifa = rt->rt_ifa; 3215 IFA_ADDREF(ifa); 3216 RT_UNLOCK(rt); 3217 } else { 3218 RT_UNLOCK(rt); 3219 } 3220 rtfree_locked(rt); 3221 } 3222 lck_mtx_unlock(rnh_lock); 3223 3224 if (oifa != NULL) { 3225 struct ifaddr *iifa; 3226 3227 /* 3228 * See if the interface pointed to by the 3229 * route is configured with the source IP 3230 * address of the packet. 3231 */ 3232 iifa = (struct ifaddr *)ifa_foraddr_scoped( 3233 src.s_addr, ifa->ifa_ifp->if_index); 3234 3235 if (iifa != NULL) { 3236 /* 3237 * Found it; drop the original one 3238 * as well as the route interface 3239 * address, and use this instead. 3240 */ 3241 IFA_REMREF(oifa); 3242 IFA_REMREF(ifa); 3243 ifa = iifa; 3244 } else if (!ipforwarding || 3245 (rt->rt_flags & RTF_GATEWAY)) { 3246 /* 3247 * This interface doesn't have that 3248 * source IP address; drop the route 3249 * interface address and just use the 3250 * original one, and let the caller 3251 * do a scoped route lookup. 3252 */ 3253 IFA_REMREF(ifa); 3254 ifa = oifa; 3255 } else { 3256 /* 3257 * Forwarding is enabled and the source 3258 * address belongs to one of our own 3259 * interfaces which isn't the outgoing 3260 * interface, and we have a route, and 3261 * the destination is on a network that 3262 * is directly attached (onlink); drop 3263 * the original one and use the route 3264 * interface address instead. 3265 */ 3266 IFA_REMREF(oifa); 3267 } 3268 } 3269 } else if (ifa != NULL && ro->ro_rt != NULL && 3270 !(ro->ro_rt->rt_flags & RTF_GATEWAY) && 3271 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) { 3272 /* 3273 * Forwarding is enabled and the source address belongs 3274 * to one of our own interfaces which isn't the same 3275 * as the interface used by the known route; drop the 3276 * original one and use the route interface address. 3277 */ 3278 IFA_REMREF(ifa); 3279 ifa = ro->ro_rt->rt_ifa; 3280 IFA_ADDREF(ifa); 3281 } 3282 3283 if (ip_select_srcif_debug && ifa != NULL) { 3284 printf("%s->%s ifscope %d ifa_if %s\n", 3285 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); 3286 } 3287 } 3288 3289 if (ro->ro_rt != NULL) 3290 RT_LOCK_ASSERT_HELD(ro->ro_rt); 3291 /* 3292 * If there is a non-loopback route with the wrong interface, or if 3293 * there is no interface configured with such an address, blow it 3294 * away. Except for local/loopback, we look for one with a matching 3295 * interface scope/index. 3296 */ 3297 if (ro->ro_rt != NULL && 3298 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) || 3299 !(ro->ro_rt->rt_flags & RTF_UP))) { 3300 if (ip_select_srcif_debug) { 3301 if (ifa != NULL) { 3302 printf("%s->%s ifscope %d ro_if %s != " 3303 "ifa_if %s (cached route cleared)\n", 3304 s_src, s_dst, ifscope, if_name(rt_ifp), 3305 if_name(ifa->ifa_ifp)); 3306 } else { 3307 printf("%s->%s ifscope %d ro_if %s " 3308 "(no ifa_if found)\n", 3309 s_src, s_dst, ifscope, if_name(rt_ifp)); 3310 } 3311 } 3312 3313 RT_UNLOCK(ro->ro_rt); 3314 ROUTE_RELEASE(ro); 3315 3316 /* 3317 * If the destination is IPv4 LLA and the route's interface 3318 * doesn't match the source interface, then the source IP 3319 * address is wrong; it most likely belongs to the primary 3320 * interface associated with the IPv4 LL subnet. Drop the 3321 * packet rather than letting it go out and return an error 3322 * to the ULP. This actually applies not only to IPv4 LL 3323 * but other shared subnets; for now we explicitly test only 3324 * for the former case and save the latter for future. 3325 */ 3326 if (IN_LINKLOCAL(ntohl(dst.s_addr)) && 3327 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) { 3328 IFA_REMREF(ifa); 3329 ifa = NULL; 3330 } 3331 } 3332 3333 if (ip_select_srcif_debug && ifa == NULL) { 3334 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n", 3335 s_src, s_dst, ifscope); 3336 } 3337 3338 /* 3339 * If there is a route, mark it accordingly. If there isn't one, 3340 * we'll get here again during the next transmit (possibly with a 3341 * route) and the flag will get set at that point. For IPv4 LLA 3342 * destination, mark it only if the route has been fully resolved; 3343 * otherwise we want to come back here again when the route points 3344 * to the interface over which the ARP reply arrives on. 3345 */ 3346 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) || 3347 (ro->ro_rt->rt_gateway->sa_family == AF_LINK && 3348 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) { 3349 if (ifa != NULL) 3350 IFA_ADDREF(ifa); /* for route */ 3351 if (ro->ro_srcia != NULL) 3352 IFA_REMREF(ro->ro_srcia); 3353 ro->ro_srcia = ifa; 3354 ro->ro_flags |= ROF_SRCIF_SELECTED; 3355 RT_GENID_SYNC(ro->ro_rt); 3356 } 3357 3358 if (ro->ro_rt != NULL) 3359 RT_UNLOCK(ro->ro_rt); 3360 3361 return (ifa); 3362} 3363 3364void 3365ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len, 3366 uint32_t *sw_csum) 3367{ 3368 int tso = TSO_IPV4_OK(ifp, m); 3369 uint32_t hwcap = ifp->if_hwassist; 3370 3371 m->m_pkthdr.csum_flags |= CSUM_IP; 3372 3373 if (!hwcksum_tx) { 3374 /* do all in software; hardware checksum offload is disabled */ 3375 *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) & 3376 m->m_pkthdr.csum_flags; 3377 } else { 3378 /* do in software what the hardware cannot */ 3379 *sw_csum = m->m_pkthdr.csum_flags & 3380 ~IF_HWASSIST_CSUM_FLAGS(hwcap); 3381 } 3382 3383 if (hlen != sizeof (struct ip)) { 3384 *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) & 3385 m->m_pkthdr.csum_flags); 3386 } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) { 3387 /* 3388 * Partial checksum offload, if non-IP fragment, and TCP only 3389 * (no UDP support, as the hardware may not be able to convert 3390 * +0 to -0 (0xffff) per RFC1122 4.1.3.4.) 3391 */ 3392 if (hwcksum_tx && !tso && 3393 (m->m_pkthdr.csum_flags & CSUM_TCP) && 3394 ip_len <= ifp->if_mtu) { 3395 uint16_t start = sizeof (struct ip); 3396 uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff; 3397 m->m_pkthdr.csum_flags |= 3398 (CSUM_DATA_VALID | CSUM_PARTIAL); 3399 m->m_pkthdr.csum_tx_stuff = (ulpoff + start); 3400 m->m_pkthdr.csum_tx_start = start; 3401 /* do IP hdr chksum in software */ 3402 *sw_csum = CSUM_DELAY_IP; 3403 } else { 3404 *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags); 3405 } 3406 } 3407 3408 if (*sw_csum & CSUM_DELAY_DATA) { 3409 in_delayed_cksum(m); 3410 *sw_csum &= ~CSUM_DELAY_DATA; 3411 } 3412 3413 if (hwcksum_tx) { 3414 /* 3415 * Drop off bits that aren't supported by hardware; 3416 * also make sure to preserve non-checksum related bits. 3417 */ 3418 m->m_pkthdr.csum_flags = 3419 ((m->m_pkthdr.csum_flags & 3420 (IF_HWASSIST_CSUM_FLAGS(hwcap) | CSUM_DATA_VALID)) | 3421 (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK)); 3422 } else { 3423 /* drop all bits; hardware checksum offload is disabled */ 3424 m->m_pkthdr.csum_flags = 0; 3425 } 3426} 3427 3428/* 3429 * GRE protocol output for PPP/PPTP 3430 */ 3431int 3432ip_gre_output(struct mbuf *m) 3433{ 3434 struct route ro; 3435 int error; 3436 3437 bzero(&ro, sizeof (ro)); 3438 3439 error = ip_output(m, NULL, &ro, 0, NULL, NULL); 3440 3441 ROUTE_RELEASE(&ro); 3442 3443 return (error); 3444} 3445